feat(05-03): implement SQLite download archive for duplicate detection
- Added rusqlite dependency with bundled feature
- Created src/archive/mod.rs with DownloadArchive trait and SqliteArchive
- Added --download-archive CLI option for archive database path
- Added --download-archive-skip-duplicates flag with default path
- Integrated archive checking in DownloadManager before download
- Records successful downloads to archive after completion
- All 129 tests pass
This commit is contained in:
87
Cargo.lock
generated
87
Cargo.lock
generated
@@ -674,6 +674,18 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fallible-iterator"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
|
||||
|
||||
[[package]]
|
||||
name = "fallible-streaming-iterator"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
@@ -709,6 +721,12 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.2"
|
||||
@@ -837,6 +855,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -937,7 +956,7 @@ version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
"foldhash 0.1.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -945,6 +964,18 @@ name = "hashbrown"
|
||||
version = "0.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
|
||||
dependencies = [
|
||||
"foldhash 0.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea0b22561a9c04a7cb1a302c013e0259cd3b4bb619f145b32f72b8b4bcbed230"
|
||||
dependencies = [
|
||||
"hashbrown 0.16.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
@@ -1361,6 +1392,17 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libsqlite3-sys"
|
||||
version = "0.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95b4103cffefa72eb8428cb6b47d6627161e51c2739fc5e3b734584157bc642a"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.11.0"
|
||||
@@ -1909,6 +1951,31 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rsqlite-vfs"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8a1f2315036ef6b1fbacd1972e8ee7688030b0a2121edfc2a6550febd41574d"
|
||||
dependencies = [
|
||||
"hashbrown 0.16.1",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rusqlite"
|
||||
version = "0.38.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1c93dd1c9683b438c392c492109cb702b8090b2bfc8fed6f6e4eb4523f17af3"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"fallible-iterator",
|
||||
"fallible-streaming-iterator",
|
||||
"hashlink",
|
||||
"libsqlite3-sys",
|
||||
"smallvec",
|
||||
"sqlite-wasm-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
@@ -2257,6 +2324,18 @@ dependencies = [
|
||||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlite-wasm-rs"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f4206ed3a67690b9c29b77d728f6acc3ce78f16bf846d83c94f76400320181b"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"js-sys",
|
||||
"rsqlite-vfs",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.1"
|
||||
@@ -2739,6 +2818,12 @@ version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
|
||||
@@ -36,6 +36,7 @@ futures = "0.3"
|
||||
zip = { version = "8.0.0", features = ["deflate"] }
|
||||
walkdir = "2.5.0"
|
||||
chrono = { version = "0.4.43", features = ["serde"] }
|
||||
rusqlite = { version = "0.38.0", features = ["bundled"] }
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
|
||||
211
src/archive/mod.rs
Normal file
211
src/archive/mod.rs
Normal file
@@ -0,0 +1,211 @@
|
||||
//! Archive module for tracking downloaded files
|
||||
//!
|
||||
//! Provides DownloadArchive trait and SqliteArchive implementation
|
||||
//! for detecting and skipping duplicate downloads.
|
||||
|
||||
use rusqlite::{params, Connection};
|
||||
use std::path::Path;
|
||||
use std::sync::Mutex;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur during archive operations
#[derive(Debug, Error)]
pub enum ArchiveError {
    /// An underlying SQLite operation failed (open, query, insert, or schema DDL).
    #[error("Database error: {0}")]
    DatabaseError(#[from] rusqlite::Error),

    /// Filesystem error, e.g. while creating the parent directory of the
    /// database file in `SqliteArchive::new`.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// The internal connection mutex was poisoned by a thread that panicked
    /// while holding it.
    #[error("Lock error")]
    LockError,
}
|
||||
|
||||
/// Trait for archive backends that track downloaded files
///
/// Implementors must be thread-safe (`Send + Sync`) so a single archive
/// can be shared across concurrent download workers.
pub trait DownloadArchive: Send + Sync {
    /// Check if a file is already in the archive
    ///
    /// A file's identity is the `(url, filename)` pair; returns `true`
    /// when a matching record exists.
    fn contains(&self, url: &str, filename: &str) -> bool;

    /// Add a file to the archive after successful download
    ///
    /// `hash`, `size`, and `extractor` are optional metadata; pass `None`
    /// when unknown.
    fn add(
        &self,
        url: &str,
        filename: &str,
        hash: Option<&str>,
        size: Option<u64>,
        extractor: Option<&str>,
    ) -> Result<(), ArchiveError>;
}
|
||||
|
||||
/// SQLite-based archive for tracking downloaded files
pub struct SqliteArchive {
    // Wrapped in a Mutex so the type satisfies the `Send + Sync` bound on
    // DownloadArchive; all database access serializes through this lock.
    // NOTE(review): presumably rusqlite::Connection is not Sync — confirm.
    conn: Mutex<Connection>,
}
|
||||
|
||||
impl SqliteArchive {
|
||||
/// Create a new SqliteArchive, creating the database if it doesn't exist
|
||||
pub fn new(path: &Path) -> Result<Self, ArchiveError> {
|
||||
// Create parent directories if they don't exist
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
let conn = Connection::open(path)?;
|
||||
let archive = Self {
|
||||
conn: Mutex::new(conn),
|
||||
};
|
||||
archive.init_schema()?;
|
||||
Ok(archive)
|
||||
}
|
||||
|
||||
/// Initialize the database schema
|
||||
fn init_schema(&self) -> Result<(), ArchiveError> {
|
||||
let conn = self.conn.lock().map_err(|_| ArchiveError::LockError)?;
|
||||
|
||||
conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS archive (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
url TEXT NOT NULL,
|
||||
filename TEXT NOT NULL,
|
||||
hash TEXT,
|
||||
size INTEGER,
|
||||
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
extractor TEXT,
|
||||
UNIQUE(url, filename)
|
||||
)",
|
||||
[],
|
||||
)?;
|
||||
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_archive_hash ON archive(hash)",
|
||||
[],
|
||||
)?;
|
||||
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_archive_url ON archive(url)",
|
||||
[],
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Manual Debug impl that deliberately omits the connection field.
// NOTE(review): presumably derive(Debug) is avoided because the wrapped
// Connection has no useful Debug output (or lacks Debug) — confirm.
impl std::fmt::Debug for SqliteArchive {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SqliteArchive").finish()
    }
}
|
||||
|
||||
impl DownloadArchive for SqliteArchive {
|
||||
/// Check if a file (identified by URL + filename) is already in the archive
|
||||
fn contains(&self, url: &str, filename: &str) -> bool {
|
||||
let conn = match self.conn.lock() {
|
||||
Ok(c) => c,
|
||||
Err(_) => return false,
|
||||
};
|
||||
|
||||
let result = conn.query_row(
|
||||
"SELECT 1 FROM archive WHERE url = ?1 AND filename = ?2",
|
||||
params![url, filename],
|
||||
|_row| Ok(()),
|
||||
);
|
||||
|
||||
match result {
|
||||
Ok(_) => true,
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => false,
|
||||
Err(e) => {
|
||||
// Log the error but don't fail - treat as not found
|
||||
log::warn!("Archive check error: {}", e);
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a file to the archive
|
||||
fn add(
|
||||
&self,
|
||||
url: &str,
|
||||
filename: &str,
|
||||
hash: Option<&str>,
|
||||
size: Option<u64>,
|
||||
extractor: Option<&str>,
|
||||
) -> Result<(), ArchiveError> {
|
||||
let conn = self.conn.lock().map_err(|_| ArchiveError::LockError)?;
|
||||
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO archive (url, filename, hash, size, extractor) VALUES (?1, ?2, ?3, ?4, ?5)",
|
||||
params![url, filename, hash, size.map(|s| s as i64), extractor],
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Open a fresh archive in a throwaway directory.
    ///
    /// The TempDir is returned alongside the archive so the directory
    /// stays alive for the duration of the test.
    fn fresh_archive() -> (TempDir, SqliteArchive) {
        let dir = TempDir::new().unwrap();
        let archive = SqliteArchive::new(&dir.path().join("archive.db")).unwrap();
        (dir, archive)
    }

    #[test]
    fn test_archive_creation() {
        // Creating a database in a brand-new directory succeeds.
        let dir = TempDir::new().unwrap();
        assert!(SqliteArchive::new(&dir.path().join("archive.db")).is_ok());
    }

    #[test]
    fn test_archive_contains_empty() {
        let (_dir, archive) = fresh_archive();

        // A brand-new archive holds no records.
        assert!(!archive.contains("https://example.com/file.jpg", "file.jpg"));
    }

    #[test]
    fn test_archive_add_and_contains() {
        let (_dir, archive) = fresh_archive();

        // Record one download with full metadata.
        archive
            .add(
                "https://example.com/file.jpg",
                "file.jpg",
                Some("abc123"),
                Some(1024),
                Some("test"),
            )
            .unwrap();

        // The recorded pair is found; an unrelated pair is not.
        assert!(archive.contains("https://example.com/file.jpg", "file.jpg"));
        assert!(!archive.contains("https://example.com/other.jpg", "other.jpg"));
    }

    #[test]
    fn test_archive_duplicate_add() {
        let (_dir, archive) = fresh_archive();

        // INSERT OR IGNORE makes re-adding the same pair a harmless no-op.
        for _ in 0..2 {
            archive
                .add("https://example.com/file.jpg", "file.jpg", None, None, None)
                .unwrap();
        }

        assert!(archive.contains("https://example.com/file.jpg", "file.jpg"));
    }
}
|
||||
34
src/cli.rs
34
src/cli.rs
@@ -111,6 +111,17 @@ pub struct Args {
|
||||
/// Use {} as placeholder for file path (e.g., --exec "scan {}")
|
||||
#[arg(long = "exec", value_name = "COMMAND", num_args = 1..)]
|
||||
pub exec: Option<Vec<String>>,
|
||||
|
||||
// ===== Archive Options =====
|
||||
/// Path to archive database for tracking downloaded files
|
||||
/// Files already in the archive will be skipped
|
||||
#[arg(long = "download-archive", value_name = "PATH")]
|
||||
pub download_archive: Option<PathBuf>,
|
||||
|
||||
/// Enable archive tracking with default path (~/.gallery-dl/archive.db)
|
||||
/// Shorthand for --download-archive with default path
|
||||
#[arg(long = "download-archive-skip-duplicates")]
|
||||
pub download_archive_skip_duplicates: bool,
|
||||
}
|
||||
|
||||
impl Args {
|
||||
@@ -188,6 +199,29 @@ pub fn parse_exec_config(args: Vec<String>) -> Option<crate::postprocess::ExecCo
|
||||
Some(crate::postprocess::ExecConfig::new(command).with_args(exec_args))
|
||||
}
|
||||
|
||||
/// Parse archive path from CLI argument
///
/// A directory argument resolves to an `archive.db` file inside that
/// directory; a file path (or a path that does not exist) passes through
/// unchanged, and `None` stays `None`. Nothing is created on disk here.
pub fn parse_archive_path(path: Option<PathBuf>) -> Option<PathBuf> {
    let candidate = path?;
    // Directory means "put the default database file in here".
    Some(match candidate.is_dir() {
        true => candidate.join("archive.db"),
        false => candidate,
    })
}
|
||||
|
||||
/// Get default archive path: `gallery-dl/archive.db` under the platform's
/// local data directory (e.g. `~/.local/share/gallery-dl/archive.db` on Linux).
///
/// Falls back to the current directory when no data directory can be
/// determined.
///
/// NOTE(review): the CLI help advertises `~/.gallery-dl/archive.db`, but
/// `dirs::data_local_dir()` resolves elsewhere on most platforms — confirm
/// which location is actually intended and align the two.
pub fn default_archive_path() -> PathBuf {
    let mut path = dirs::data_local_dir().unwrap_or_else(|| PathBuf::from("."));
    path.push("gallery-dl");
    path.push("archive.db");
    path
}
|
||||
|
||||
/// Configuration file type
|
||||
#[derive(Debug, Clone, ValueEnum)]
|
||||
pub enum ConfigType {
|
||||
|
||||
@@ -10,12 +10,14 @@ pub mod worker;
|
||||
pub use templates::{PathTemplate, TemplateError};
|
||||
pub use worker::{DownloadItem, DownloadWorker, DownloadWorkerResult, download_batch};
|
||||
|
||||
use crate::archive::{DownloadArchive, SqliteArchive};
|
||||
use progress::DownloadProgress;
|
||||
use resume::{get_partial_path, get_resume_offset, ResumeError};
|
||||
use reqwest::header::{HeaderMap, RANGE};
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use thiserror::Error;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
@@ -60,6 +62,8 @@ pub struct DownloadOptions {
|
||||
pub resume: bool,
|
||||
/// File filter options
|
||||
pub filter: Option<FileFilter>,
|
||||
/// Archive for tracking downloaded files
|
||||
pub archive: Option<Arc<SqliteArchive>>,
|
||||
}
|
||||
|
||||
/// File filter for skipping downloads based on size or type
|
||||
@@ -172,6 +176,7 @@ impl DownloadOptions {
|
||||
headers: HeaderMap::new(),
|
||||
resume: true,
|
||||
filter: None,
|
||||
archive: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,6 +209,12 @@ impl DownloadOptions {
|
||||
self.filter = Some(filter);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set archive for duplicate detection
///
/// Builder-style setter: consumes and returns `self`. When an archive is
/// set, downloads whose (url, filename) pair is already recorded are
/// skipped by the download manager before any network work happens.
pub fn archive(mut self, archive: Arc<SqliteArchive>) -> Self {
    self.archive = Some(archive);
    self
}
||||
}
|
||||
|
||||
/// Result of a successful download
|
||||
@@ -242,6 +253,20 @@ impl DownloadManager {
|
||||
pub async fn download(&self, options: DownloadOptions) -> Result<DownloadResult, DownloadError> {
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
// Get the filename from the destination path
|
||||
let filename = options.destination
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("unknown");
|
||||
|
||||
// Check archive for duplicate before downloading
|
||||
if let Some(ref archive) = options.archive {
|
||||
if archive.contains(&options.url, filename) {
|
||||
println!("Skipping {} - already in archive", filename);
|
||||
return Err(DownloadError::Filtered("already in archive".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
// Get the .part file path
|
||||
let partial_path = get_partial_path(&options.destination);
|
||||
|
||||
@@ -374,6 +399,13 @@ impl DownloadManager {
|
||||
// Rename .part file to final destination
|
||||
tokio::fs::rename(&partial_path, &options.destination).await?;
|
||||
|
||||
// Record to archive after successful download
|
||||
if let Some(ref archive) = options.archive {
|
||||
if let Err(e) = archive.add(&options.url, filename, None, Some(downloaded), None) {
|
||||
log::warn!("Failed to record download to archive: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
let duration_ms = start_time.elapsed().as_millis() as u64;
|
||||
|
||||
Ok(DownloadResult {
|
||||
|
||||
@@ -9,6 +9,7 @@ pub mod logging;
|
||||
pub mod extractor;
|
||||
pub mod download;
|
||||
pub mod postprocess;
|
||||
pub mod archive;
|
||||
|
||||
// Re-export extractor types for library users
|
||||
pub use extractor::{
|
||||
@@ -42,6 +43,11 @@ pub use postprocess::{
|
||||
PostProcessError,
|
||||
};
|
||||
|
||||
// Re-export archive types for library users
|
||||
pub use archive::{
|
||||
DownloadArchive, SqliteArchive, ArchiveError,
|
||||
};
|
||||
|
||||
/// Version of the gallery-dl crate
|
||||
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user