feat(05-01): implement MetadataPostProcessor for JSON sidecar files
- Created src/postprocess/metadata.rs with MetadataPostProcessor struct
- Implements PostProcessor trait with process() and finalize() methods
- Writes .metadata.json files next to downloaded files
- Includes ExtendedMetadata for additional structured fields
- Added tests for metadata path generation and JSON writing
- All 106 tests pass
This commit is contained in:
180
src/postprocess/metadata.rs
Normal file
180
src/postprocess/metadata.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
//! Metadata JSON post-processor
|
||||
//!
|
||||
//! Writes JSON sidecar files with download metadata.
|
||||
|
||||
use crate::postprocess::{DownloadMetadata, PostProcessError, PostProcessor};
|
||||
use async_trait::async_trait;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Post-processor that writes a JSON sidecar file for each processed download.
///
/// Sidecars are placed in `output_directory` and are named after the source
/// file's full name with `.metadata.json` appended
/// (`image.jpg` -> `image.jpg.metadata.json`).
#[derive(Debug, Clone)]
pub struct MetadataPostProcessor {
    /// Directory to write metadata files
    output_directory: PathBuf,
}

impl MetadataPostProcessor {
    /// Create a new `MetadataPostProcessor` that writes into `output_directory`.
    ///
    /// Construction does not touch the filesystem.
    pub fn new(output_directory: PathBuf) -> Self {
        Self { output_directory }
    }

    /// Get the metadata file path for a given file.
    ///
    /// Only the final component of `file_path` is used; the result is always
    /// located directly inside `output_directory`. When `file_path` has no
    /// final component (for example `/` or a path ending in `..`), the
    /// placeholder name `file` is used instead.
    fn metadata_path(&self, file_path: &Path) -> PathBuf {
        // Work on the raw OS string rather than splitting into stem/extension
        // and round-tripping through `&str`: that keeps non-UTF-8 names and
        // names with a trailing dot distinct, so two different source files
        // never map to the same sidecar.
        let mut sidecar_name = file_path
            .file_name()
            .map(|name| name.to_os_string())
            .unwrap_or_else(|| std::ffi::OsString::from("file"));
        sidecar_name.push(".metadata.json");

        self.output_directory.join(sidecar_name)
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for MetadataPostProcessor {
|
||||
/// Process a single file - writes metadata JSON next to it
|
||||
async fn process(&self, path: &Path, metadata: &DownloadMetadata) -> Result<(), PostProcessError> {
|
||||
if !path.exists() {
|
||||
return Err(PostProcessError::FileNotFound(path.to_path_buf()));
|
||||
}
|
||||
|
||||
// Create output directory if it doesn't exist
|
||||
if !self.output_directory.exists() {
|
||||
std::fs::create_dir_all(&self.output_directory)
|
||||
.map_err(|e| PostProcessError::IoError(e))?;
|
||||
}
|
||||
|
||||
// Write metadata to JSON file
|
||||
let metadata_path = self.metadata_path(path);
|
||||
let json = serde_json::to_string_pretty(metadata)
|
||||
.map_err(|e| PostProcessError::JsonError(e))?;
|
||||
|
||||
std::fs::write(&metadata_path, json)
|
||||
.map_err(|e| PostProcessError::IoError(e))?;
|
||||
|
||||
log::debug!("Wrote metadata: {}", metadata_path.display());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Finalize - nothing to do for metadata (files written per-process)
|
||||
async fn finalize(&self) -> Result<(), PostProcessError> {
|
||||
// No finalization needed - metadata written per file
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Extended metadata structure with additional fields
|
||||
#[derive(Debug, serde::Serialize)]
|
||||
pub struct ExtendedMetadata {
|
||||
/// Source URL of the downloaded file
|
||||
#[serde(rename = "url")]
|
||||
pub url: String,
|
||||
/// Original filename
|
||||
#[serde(rename = "filename")]
|
||||
pub filename: String,
|
||||
/// File size in bytes
|
||||
#[serde(rename = "size")]
|
||||
pub size: u64,
|
||||
/// Content-Type (MIME type)
|
||||
#[serde(rename = "contentType", skip_serializing_if = "Option::is_none")]
|
||||
pub content_type: Option<String>,
|
||||
/// Download timestamp (ISO 8601)
|
||||
#[serde(rename = "downloadedAt")]
|
||||
pub downloaded_at: String,
|
||||
/// Extractor name that generated this file
|
||||
#[serde(rename = "extractor", skip_serializing_if = "Option::is_none")]
|
||||
pub extractor: Option<String>,
|
||||
/// Original file path (relative or absolute)
|
||||
#[serde(rename = "originalPath", skip_serializing_if = "Option::is_none")]
|
||||
pub original_path: Option<String>,
|
||||
}
|
||||
|
||||
impl From<&DownloadMetadata> for ExtendedMetadata {
    /// Builds an `ExtendedMetadata` by copying the fields of `source`.
    ///
    /// The timestamp is rendered with `to_rfc3339()`, and `original_path` is
    /// always left as `None` by this conversion.
    fn from(source: &DownloadMetadata) -> Self {
        let url = source.url.clone();
        let filename = source.filename.clone();
        let content_type = source.content_type.clone();
        let downloaded_at = source.timestamp.to_rfc3339();
        let extractor = source.extractor.clone();

        Self {
            url,
            filename,
            size: source.size,
            content_type,
            downloaded_at,
            extractor,
            original_path: None,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Constructing a processor must not create its output directory.
    #[test]
    fn test_metadata_processor_creation() {
        let temp_dir = TempDir::new().unwrap();
        // `TempDir::new()` creates its own directory, so the check has to be
        // made against a child path that does not exist yet.
        let output_dir = temp_dir.path().join("metadata");

        let _processor = MetadataPostProcessor::new(output_dir.clone());

        // Should not create directory until first use
        assert!(!output_dir.exists());
    }

    /// Sidecar paths live in the output directory and keep the full file name.
    #[test]
    fn test_metadata_path() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().to_path_buf();

        let processor = MetadataPostProcessor::new(output_dir.clone());

        // Test with extension
        let path = Path::new("/downloads/image.jpg");
        let metadata_path = processor.metadata_path(path);
        assert_eq!(metadata_path, output_dir.join("image.jpg.metadata.json"));

        // Test without extension
        let path = Path::new("/downloads/image");
        let metadata_path = processor.metadata_path(path);
        assert_eq!(metadata_path, output_dir.join("image.metadata.json"));
    }

    /// `process` writes a JSON sidecar containing the download metadata.
    #[tokio::test]
    async fn test_metadata_write() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().to_path_buf();

        let processor = MetadataPostProcessor::new(output_dir.clone());

        // Create a test file
        let test_file = temp_dir.path().join("test.jpg");
        std::fs::write(&test_file, b"test content").unwrap();

        let metadata = DownloadMetadata::new(
            "https://example.com/image.jpg".to_string(),
            "image.jpg".to_string(),
            1000,
        );

        let result = processor.process(&test_file, &metadata).await;
        assert!(result.is_ok());

        // Check metadata file was created
        let metadata_path = output_dir.join("test.jpg.metadata.json");
        assert!(metadata_path.exists());

        // Verify JSON content
        let content = std::fs::read_to_string(&metadata_path).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&content).unwrap();
        assert_eq!(parsed["url"], "https://example.com/image.jpg");
        assert_eq!(parsed["filename"], "image.jpg");
        assert_eq!(parsed["size"], 1000);
    }

    /// `process` creates a missing output directory before writing.
    #[tokio::test]
    async fn test_metadata_write_creates_output_directory() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().join("nested").join("metadata");

        let processor = MetadataPostProcessor::new(output_dir.clone());

        let test_file = temp_dir.path().join("test.jpg");
        std::fs::write(&test_file, b"test content").unwrap();

        let metadata = DownloadMetadata::new(
            "https://example.com/image.jpg".to_string(),
            "image.jpg".to_string(),
            1000,
        );

        let result = processor.process(&test_file, &metadata).await;
        assert!(result.is_ok());
        assert!(output_dir.join("test.jpg.metadata.json").exists());
    }

    /// `process` reports a missing source file instead of writing a sidecar.
    #[tokio::test]
    async fn test_metadata_missing_source_file() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().to_path_buf();

        let processor = MetadataPostProcessor::new(output_dir.clone());

        let missing_file = temp_dir.path().join("missing.jpg");
        let metadata = DownloadMetadata::new(
            "https://example.com/image.jpg".to_string(),
            "image.jpg".to_string(),
            1000,
        );

        let result = processor.process(&missing_file, &metadata).await;
        assert!(matches!(result, Err(PostProcessError::FileNotFound(_))));
        assert!(!output_dir.join("missing.jpg.metadata.json").exists());
    }
}
|
||||
Reference in New Issue
Block a user