feat(05-01): implement MetadataPostProcessor for JSON sidecar files

- Created src/postprocess/metadata.rs with MetadataPostProcessor struct
- Implements PostProcessor trait with process() and finalize() methods
- Writes .metadata.json files next to downloaded files
- Includes ExtendedMetadata for additional structured fields
- Added tests for metadata path generation and JSON writing
- All 106 tests pass
This commit is contained in:
2026-02-16 08:58:28 +01:00
parent 1b6dfeec8f
commit 1e01cffa94

180
src/postprocess/metadata.rs Normal file
View File

@@ -0,0 +1,180 @@
//! Metadata JSON post-processor
//!
//! Writes JSON sidecar files with download metadata.
use crate::postprocess::{DownloadMetadata, PostProcessError, PostProcessor};
use async_trait::async_trait;
use std::path::{Path, PathBuf};
/// Post-processor that writes a JSON sidecar file with download metadata.
///
/// Each sidecar is named `<original file name>.metadata.json` and is placed in
/// the configured output directory.
#[derive(Debug, Clone)]
pub struct MetadataPostProcessor {
    /// Directory the metadata files are written to.
    output_directory: PathBuf,
}

impl MetadataPostProcessor {
    /// Creates a processor that writes its sidecar files into `output_directory`.
    ///
    /// Construction only stores the path; nothing is touched on disk.
    pub fn new(output_directory: PathBuf) -> Self {
        Self { output_directory }
    }

    /// Returns the sidecar path for `file_path`.
    ///
    /// The result is `output_directory/<file name>.metadata.json`, where
    /// `<file name>` is the final component of `file_path` taken verbatim. When
    /// `file_path` has no final component (for example `..` or `/`), the
    /// placeholder name `file` is used instead.
    fn metadata_path(&self, file_path: &Path) -> PathBuf {
        // Work on the raw `OsString` rather than going through `to_str()`: this keeps
        // non-UTF-8 names intact and keeps names such as `report.` and `report`
        // distinct, so different downloads never share (and overwrite) one sidecar.
        let mut sidecar_name = file_path
            .file_name()
            .map_or_else(|| std::ffi::OsString::from("file"), |name| name.to_os_string());
        sidecar_name.push(".metadata.json");
        self.output_directory.join(sidecar_name)
    }
}
#[async_trait]
impl PostProcessor for MetadataPostProcessor {
    /// Writes the metadata of a single downloaded file as pretty-printed JSON.
    ///
    /// The sidecar file is written into the processor's output directory (which is
    /// created on demand), at the path returned by `metadata_path` for `path`.
    /// Note that this is the configured output directory, not necessarily the
    /// directory that contains `path`.
    ///
    /// # Errors
    ///
    /// * `PostProcessError::FileNotFound` if `path` does not exist.
    /// * `PostProcessError::IoError` if the output directory cannot be created or
    ///   the sidecar file cannot be written.
    /// * `PostProcessError::JsonError` if `metadata` cannot be serialized.
    async fn process(
        &self,
        path: &Path,
        metadata: &DownloadMetadata,
    ) -> Result<(), PostProcessError> {
        if !path.exists() {
            return Err(PostProcessError::FileNotFound(path.to_path_buf()));
        }

        // `create_dir_all` already succeeds when the directory exists, so a separate
        // `exists()` check beforehand would only add a check-then-create race.
        std::fs::create_dir_all(&self.output_directory).map_err(PostProcessError::IoError)?;

        let metadata_path = self.metadata_path(path);
        let json = serde_json::to_string_pretty(metadata).map_err(PostProcessError::JsonError)?;
        std::fs::write(&metadata_path, json).map_err(PostProcessError::IoError)?;

        log::debug!("Wrote metadata: {}", metadata_path.display());
        Ok(())
    }

    /// Finalization hook; a no-op because every sidecar is written in `process`.
    async fn finalize(&self) -> Result<(), PostProcessError> {
        Ok(())
    }
}
/// Richer metadata record for a downloaded file.
///
/// Serializes with camelCase keys (`url`, `filename`, `size`, `contentType`,
/// `downloadedAt`, `extractor`, `originalPath`); optional fields that are `None`
/// are left out of the output entirely.
#[derive(Debug, serde::Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ExtendedMetadata {
    /// URL the file was downloaded from.
    pub url: String,

    /// Original name of the file.
    pub filename: String,

    /// Size of the file in bytes.
    pub size: u64,

    /// Content-Type (MIME type), when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_type: Option<String>,

    /// Time of the download as an ISO 8601 string.
    pub downloaded_at: String,

    /// Name of the extractor that produced this file, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extractor: Option<String>,

    /// Original path of the file (relative or absolute), when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub original_path: Option<String>,
}
impl From<&DownloadMetadata> for ExtendedMetadata {
    /// Builds an [`ExtendedMetadata`] by copying the fields of a `DownloadMetadata`.
    ///
    /// The timestamp is rendered with `to_rfc3339()`. `original_path` is always
    /// `None` here because nothing is read from the source value for it.
    fn from(source: &DownloadMetadata) -> Self {
        // Render the timestamp first; the remaining fields are plain copies.
        let downloaded_at = source.timestamp.to_rfc3339();
        let url = source.url.clone();
        let filename = source.filename.clone();
        let content_type = source.content_type.clone();
        let extractor = source.extractor.clone();

        Self {
            url,
            filename,
            size: source.size,
            content_type,
            downloaded_at,
            extractor,
            original_path: None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds the metadata fixture shared by the write tests.
    fn sample_metadata() -> DownloadMetadata {
        DownloadMetadata::new(
            "https://example.com/image.jpg".to_string(),
            "image.jpg".to_string(),
            1000,
        )
    }

    /// Constructing a processor must not create its output directory.
    #[test]
    fn test_metadata_processor_creation() {
        let temp_dir = TempDir::new().unwrap();
        // `TempDir::new` creates its own directory, so the check has to target a
        // child path that does not exist yet.
        let output_dir = temp_dir.path().join("metadata");
        let _processor = MetadataPostProcessor::new(output_dir.clone());

        // Should not create directory until first use
        assert!(!output_dir.exists());
    }

    /// The sidecar name is the original file name plus `.metadata.json`.
    #[test]
    fn test_metadata_path() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().to_path_buf();
        let processor = MetadataPostProcessor::new(output_dir.clone());

        // Test with extension
        let metadata_path = processor.metadata_path(Path::new("/downloads/image.jpg"));
        assert_eq!(metadata_path, output_dir.join("image.jpg.metadata.json"));

        // Test without extension
        let metadata_path = processor.metadata_path(Path::new("/downloads/image"));
        assert_eq!(metadata_path, output_dir.join("image.metadata.json"));
    }

    /// `process` writes a JSON sidecar containing the metadata fields.
    #[tokio::test]
    async fn test_metadata_write() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().to_path_buf();
        let processor = MetadataPostProcessor::new(output_dir.clone());

        // Create a test file
        let test_file = temp_dir.path().join("test.jpg");
        std::fs::write(&test_file, b"test content").unwrap();

        let metadata = sample_metadata();
        let result = processor.process(&test_file, &metadata).await;
        assert!(result.is_ok());

        // Check metadata file was created
        let metadata_path = output_dir.join("test.jpg.metadata.json");
        assert!(metadata_path.exists());

        // Verify JSON content
        let content = std::fs::read_to_string(&metadata_path).unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&content).unwrap();
        assert_eq!(parsed["url"], "https://example.com/image.jpg");
        assert_eq!(parsed["filename"], "image.jpg");
        assert_eq!(parsed["size"], 1000);
    }

    /// `process` creates a missing (nested) output directory on first use.
    #[tokio::test]
    async fn test_metadata_write_creates_output_directory() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().join("nested").join("metadata");
        let processor = MetadataPostProcessor::new(output_dir.clone());

        let test_file = temp_dir.path().join("test.jpg");
        std::fs::write(&test_file, b"test content").unwrap();

        let result = processor.process(&test_file, &sample_metadata()).await;
        assert!(result.is_ok());
        assert!(output_dir.join("test.jpg.metadata.json").exists());
    }

    /// `process` rejects a path that does not exist and writes nothing.
    #[tokio::test]
    async fn test_metadata_write_missing_file() {
        let temp_dir = TempDir::new().unwrap();
        let output_dir = temp_dir.path().join("metadata");
        let processor = MetadataPostProcessor::new(output_dir.clone());

        let missing = temp_dir.path().join("does-not-exist.jpg");
        let result = processor.process(&missing, &sample_metadata()).await;

        assert!(matches!(result, Err(PostProcessError::FileNotFound(_))));
        assert!(!output_dir.exists());
    }
}