diff --git a/src/postprocess/metadata.rs b/src/postprocess/metadata.rs new file mode 100644 index 00000000..94526874 --- /dev/null +++ b/src/postprocess/metadata.rs @@ -0,0 +1,180 @@ +//! Metadata JSON post-processor +//! +//! Writes JSON sidecar files with download metadata. + +use crate::postprocess::{DownloadMetadata, PostProcessError, PostProcessor}; +use async_trait::async_trait; +use std::path::{Path, PathBuf}; + +/// Metadata JSON post-processor +pub struct MetadataPostProcessor { + /// Directory to write metadata files + output_directory: PathBuf, +} + +impl MetadataPostProcessor { + /// Create a new MetadataPostProcessor + pub fn new(output_directory: PathBuf) -> Self { + Self { output_directory } + } + + /// Get the metadata file path for a given file + fn metadata_path(&self, file_path: &Path) -> PathBuf { + let stem = file_path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("file"); + + let ext = file_path + .extension() + .and_then(|s| s.to_str()) + .unwrap_or(""); + + // Create .metadata.json extension + if ext.is_empty() { + self.output_directory.join(format!("{}.metadata.json", stem)) + } else { + self.output_directory.join(format!("{}.{}.metadata.json", stem, ext)) + } + } +} + +#[async_trait] +impl PostProcessor for MetadataPostProcessor { + /// Process a single file - writes metadata JSON next to it + async fn process(&self, path: &Path, metadata: &DownloadMetadata) -> Result<(), PostProcessError> { + if !path.exists() { + return Err(PostProcessError::FileNotFound(path.to_path_buf())); + } + + // Create output directory if it doesn't exist + if !self.output_directory.exists() { + std::fs::create_dir_all(&self.output_directory) + .map_err(|e| PostProcessError::IoError(e))?; + } + + // Write metadata to JSON file + let metadata_path = self.metadata_path(path); + let json = serde_json::to_string_pretty(metadata) + .map_err(|e| PostProcessError::JsonError(e))?; + + std::fs::write(&metadata_path, json) + .map_err(|e| PostProcessError::IoError(e))?; + + log::debug!("Wrote metadata: {}", metadata_path.display()); + Ok(()) + } + + /// Finalize - nothing to do for metadata (files written per-process) + async fn finalize(&self) -> Result<(), PostProcessError> { + // No finalization needed - metadata written per file + Ok(()) + } +} + +/// Extended metadata structure with additional fields +#[derive(Debug, serde::Serialize)] +pub struct ExtendedMetadata { + /// Source URL of the downloaded file + #[serde(rename = "url")] + pub url: String, + /// Original filename + #[serde(rename = "filename")] + pub filename: String, + /// File size in bytes + #[serde(rename = "size")] + pub size: u64, + /// Content-Type (MIME type) + #[serde(rename = "contentType", skip_serializing_if = "Option::is_none")] + pub content_type: Option, + /// Download timestamp (ISO 8601) + #[serde(rename = "downloadedAt")] + pub downloaded_at: String, + /// Extractor name that generated this file + #[serde(rename = "extractor", skip_serializing_if = "Option::is_none")] + pub extractor: Option, + /// Original file path (relative or absolute) + #[serde(rename = "originalPath", skip_serializing_if = "Option::is_none")] + pub original_path: Option, +} + +impl From<&DownloadMetadata> for ExtendedMetadata { + fn from(m: &DownloadMetadata) -> Self { + Self { + url: m.url.clone(), + filename: m.filename.clone(), + size: m.size, + content_type: m.content_type.clone(), + downloaded_at: m.timestamp.to_rfc3339(), + extractor: m.extractor.clone(), + original_path: None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[tokio::test] + async fn test_metadata_processor_creation() { + let temp_dir = TempDir::new().unwrap(); + let output_dir = temp_dir.path().to_path_buf(); + + let processor = MetadataPostProcessor::new(output_dir.clone()); + + // Should not create directory until first use + assert!(!output_dir.exists()); + } + + #[test] + fn test_metadata_path() { + let temp_dir = TempDir::new().unwrap(); + let output_dir = temp_dir.path().to_path_buf(); + + let processor = MetadataPostProcessor::new(output_dir); + + // Test with extension + let path = Path::new("/downloads/image.jpg"); + let metadata_path = processor.metadata_path(path); + assert!(metadata_path.to_string_lossy().ends_with(".jpg.metadata.json")); + + // Test without extension + let path = Path::new("/downloads/image"); + let metadata_path = processor.metadata_path(path); + assert!(metadata_path.to_string_lossy().ends_with(".metadata.json")); + } + + #[tokio::test] + async fn test_metadata_write() { + let temp_dir = TempDir::new().unwrap(); + let output_dir = temp_dir.path().to_path_buf(); + + let processor = MetadataPostProcessor::new(output_dir.clone()); + + // Create a test file + let test_file = temp_dir.path().join("test.jpg"); + std::fs::write(&test_file, b"test content").unwrap(); + + let metadata = DownloadMetadata::new( + "https://example.com/image.jpg".to_string(), + "image.jpg".to_string(), + 1000, + ); + + let result = processor.process(&test_file, &metadata).await; + assert!(result.is_ok()); + + // Check metadata file was created + let metadata_path = output_dir.join("test.jpg.metadata.json"); + assert!(metadata_path.exists()); + + // Verify JSON content + let content = std::fs::read_to_string(&metadata_path).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&content).unwrap(); + assert_eq!(parsed["url"], "https://example.com/image.jpg"); + assert_eq!(parsed["filename"], "image.jpg"); + assert_eq!(parsed["size"], 1000); + } +}