feat(05-03): implement SQLite download archive for duplicate detection

- Added rusqlite dependency with bundled feature
- Created src/archive/mod.rs with DownloadArchive trait and SqliteArchive
- Added --download-archive CLI option for archive database path
- Added --download-archive-skip-duplicates flag with default path
- Integrated archive checking in DownloadManager before download
- Records successful downloads to archive after completion
- All 129 tests pass
This commit is contained in:
2026-02-16 09:25:23 +01:00
parent 2eeb8f7d6b
commit 2117d5d6fe
6 changed files with 370 additions and 1 deletions

87
Cargo.lock generated
View File

@@ -674,6 +674,18 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fastrand"
version = "2.3.0"
@@ -709,6 +721,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -837,6 +855,7 @@ dependencies = [
"once_cell",
"regex",
"reqwest",
"rusqlite",
"scraper",
"serde",
"serde_json",
@@ -937,7 +956,7 @@ version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@@ -945,6 +964,18 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
dependencies = [
"foldhash 0.2.0",
]
[[package]]
name = "hashlink"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea0b22561a9c04a7cb1a302c013e0259cd3b4bb619f145b32f72b8b4bcbed230"
dependencies = [
"hashbrown 0.16.1",
]
[[package]]
name = "heck"
@@ -1361,6 +1392,17 @@ dependencies = [
"libc",
]
[[package]]
name = "libsqlite3-sys"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95b4103cffefa72eb8428cb6b47d6627161e51c2739fc5e3b734584157bc642a"
dependencies = [
"cc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "linux-raw-sys"
version = "0.11.0"
@@ -1909,6 +1951,31 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "rsqlite-vfs"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8a1f2315036ef6b1fbacd1972e8ee7688030b0a2121edfc2a6550febd41574d"
dependencies = [
"hashbrown 0.16.1",
"thiserror 2.0.18",
]
[[package]]
name = "rusqlite"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1c93dd1c9683b438c392c492109cb702b8090b2bfc8fed6f6e4eb4523f17af3"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
"sqlite-wasm-rs",
]
[[package]]
name = "rustc-hash"
version = "2.1.1"
@@ -2257,6 +2324,18 @@ dependencies = [
"windows-sys 0.60.2",
]
[[package]]
name = "sqlite-wasm-rs"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f4206ed3a67690b9c29b77d728f6acc3ce78f16bf846d83c94f76400320181b"
dependencies = [
"cc",
"js-sys",
"rsqlite-vfs",
"wasm-bindgen",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
@@ -2739,6 +2818,12 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"

View File

@@ -36,6 +36,7 @@ futures = "0.3"
zip = { version = "8.0.0", features = ["deflate"] }
walkdir = "2.5.0"
chrono = { version = "0.4.43", features = ["serde"] }
rusqlite = { version = "0.38.0", features = ["bundled"] }
[profile.release]
opt-level = 3

211
src/archive/mod.rs Normal file
View File

@@ -0,0 +1,211 @@
//! Archive module for tracking downloaded files
//!
//! Provides DownloadArchive trait and SqliteArchive implementation
//! for detecting and skipping duplicate downloads.
use rusqlite::{params, Connection};
use std::path::Path;
use std::sync::Mutex;
use thiserror::Error;
/// Errors that can occur during archive operations.
#[derive(Debug, Error)]
pub enum ArchiveError {
    /// Any error surfaced by the underlying SQLite driver.
    #[error("Database error: {0}")]
    DatabaseError(#[from] rusqlite::Error),
    /// Filesystem failure (e.g. creating the database's parent directory).
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    /// The internal connection mutex was poisoned by a panicking thread.
    #[error("Lock error")]
    LockError,
}
/// Trait for archive backends that track downloaded files.
///
/// Implementations must be `Send + Sync` so one archive instance can be
/// shared (e.g. via `Arc`) across concurrent download workers.
pub trait DownloadArchive: Send + Sync {
    /// Check if a file is already in the archive.
    ///
    /// Files are identified by the (url, filename) pair; returns `true`
    /// when a matching record exists.
    fn contains(&self, url: &str, filename: &str) -> bool;

    /// Add a file to the archive after successful download.
    ///
    /// `hash`, `size`, and `extractor` are optional metadata. Re-adding an
    /// existing (url, filename) pair must not be an error (the SQLite
    /// backend ignores such duplicates).
    fn add(
        &self,
        url: &str,
        filename: &str,
        hash: Option<&str>,
        size: Option<u64>,
        extractor: Option<&str>,
    ) -> Result<(), ArchiveError>;
}
/// SQLite-based archive for tracking downloaded files.
///
/// The connection is wrapped in a `Mutex` so the archive can be shared
/// between threads; all database access is serialized through the lock.
pub struct SqliteArchive {
    // Single SQLite connection; every query takes the lock first.
    conn: Mutex<Connection>,
}
impl SqliteArchive {
    /// Open (or create) the archive database at `path`.
    ///
    /// Missing parent directories are created first, and the schema is
    /// (re-)initialized on every open, so a fresh file becomes a valid
    /// archive immediately.
    pub fn new(path: &Path) -> Result<Self, ArchiveError> {
        // Ensure the directory holding the database exists.
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent)?;
        }
        let archive = Self {
            conn: Mutex::new(Connection::open(path)?),
        };
        archive.init_schema()?;
        Ok(archive)
    }

    /// Create the `archive` table and its lookup indexes if they are absent.
    fn init_schema(&self) -> Result<(), ArchiveError> {
        let guard = self.conn.lock().map_err(|_| ArchiveError::LockError)?;
        guard.execute(
            "CREATE TABLE IF NOT EXISTS archive (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                filename TEXT NOT NULL,
                hash TEXT,
                size INTEGER,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                extractor TEXT,
                UNIQUE(url, filename)
            )",
            [],
        )?;
        // Secondary indexes used by duplicate lookups.
        for index_sql in [
            "CREATE INDEX IF NOT EXISTS idx_archive_hash ON archive(hash)",
            "CREATE INDEX IF NOT EXISTS idx_archive_url ON archive(url)",
        ] {
            guard.execute(index_sql, [])?;
        }
        Ok(())
    }
}
// Manual Debug impl: the SQLite connection has no Debug implementation,
// so only the type name is rendered.
impl std::fmt::Debug for SqliteArchive {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "SqliteArchive")
    }
}
impl DownloadArchive for SqliteArchive {
    /// Check whether a (url, filename) pair is already recorded.
    ///
    /// Deliberately infallible: a poisoned lock or an unexpected query
    /// error is logged and treated as "not archived", so a broken archive
    /// never blocks downloads.
    fn contains(&self, url: &str, filename: &str) -> bool {
        let Ok(conn) = self.conn.lock() else {
            // Lock poisoned — fail open.
            return false;
        };
        let lookup = conn.query_row(
            "SELECT 1 FROM archive WHERE url = ?1 AND filename = ?2",
            params![url, filename],
            |_row| Ok(()),
        );
        match lookup {
            Ok(()) => true,
            Err(rusqlite::Error::QueryReturnedNoRows) => false,
            Err(e) => {
                // Log the error but don't fail - treat as not found
                log::warn!("Archive check error: {}", e);
                false
            }
        }
    }

    /// Record a download; duplicates of an existing (url, filename) pair
    /// are silently ignored via `INSERT OR IGNORE`.
    fn add(
        &self,
        url: &str,
        filename: &str,
        hash: Option<&str>,
        size: Option<u64>,
        extractor: Option<&str>,
    ) -> Result<(), ArchiveError> {
        // SQLite stores integers as i64; sizes beyond i64::MAX would wrap,
        // but real file sizes never approach that bound.
        let size_i64 = size.map(|s| s as i64);
        let guard = self.conn.lock().map_err(|_| ArchiveError::LockError)?;
        guard.execute(
            "INSERT OR IGNORE INTO archive (url, filename, hash, size, extractor) VALUES (?1, ?2, ?3, ?4, ?5)",
            params![url, filename, hash, size_i64, extractor],
        )?;
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Build a fresh archive backed by a database in a temp directory.
    /// The `TempDir` is returned alongside it so the directory outlives
    /// the archive for the duration of the test.
    fn fresh_archive() -> (TempDir, SqliteArchive) {
        let dir = TempDir::new().unwrap();
        let archive = SqliteArchive::new(&dir.path().join("archive.db")).unwrap();
        (dir, archive)
    }

    #[test]
    fn test_archive_creation() {
        // Opening a non-existent database file must succeed.
        let dir = TempDir::new().unwrap();
        assert!(SqliteArchive::new(&dir.path().join("archive.db")).is_ok());
    }

    #[test]
    fn test_archive_contains_empty() {
        // A brand-new archive contains nothing.
        let (_dir, archive) = fresh_archive();
        assert!(!archive.contains("https://example.com/file.jpg", "file.jpg"));
    }

    #[test]
    fn test_archive_add_and_contains() {
        let (_dir, archive) = fresh_archive();
        archive
            .add(
                "https://example.com/file.jpg",
                "file.jpg",
                Some("abc123"),
                Some(1024),
                Some("test"),
            )
            .unwrap();
        // The recorded pair is found; an unrelated pair is not.
        assert!(archive.contains("https://example.com/file.jpg", "file.jpg"));
        assert!(!archive.contains("https://example.com/other.jpg", "other.jpg"));
    }

    #[test]
    fn test_archive_duplicate_add() {
        let (_dir, archive) = fresh_archive();
        // Adding the same pair twice must not fail (INSERT OR IGNORE).
        for _ in 0..2 {
            archive
                .add("https://example.com/file.jpg", "file.jpg", None, None, None)
                .unwrap();
        }
        assert!(archive.contains("https://example.com/file.jpg", "file.jpg"));
    }
}

View File

@@ -111,6 +111,17 @@ pub struct Args {
/// Use {} as placeholder for file path (e.g., --exec "scan {}")
#[arg(long = "exec", value_name = "COMMAND", num_args = 1..)]
pub exec: Option<Vec<String>>,
// ===== Archive Options =====
/// Path to archive database for tracking downloaded files
/// Files already in the archive will be skipped
#[arg(long = "download-archive", value_name = "PATH")]
pub download_archive: Option<PathBuf>,
/// Enable archive tracking with default path (~/.gallery-dl/archive.db)
/// Shorthand for --download-archive with default path
#[arg(long = "download-archive-skip-duplicates")]
pub download_archive_skip_duplicates: bool,
}
impl Args {
@@ -188,6 +199,29 @@ pub fn parse_exec_config(args: Vec<String>) -> Option<crate::postprocess::ExecCo
Some(crate::postprocess::ExecConfig::new(command).with_args(exec_args))
}
/// Parse archive path from CLI argument.
///
/// If the given path is an existing directory, the database file
/// `archive.db` inside that directory is used; otherwise the path is
/// returned unchanged and treated as the database file itself.
///
/// Note: this function does NOT create any directories — missing parents
/// are created later by `SqliteArchive::new`. (The previous doc comment
/// incorrectly claimed directory creation happened here.)
pub fn parse_archive_path(path: Option<PathBuf>) -> Option<PathBuf> {
    path.map(|p| if p.is_dir() { p.join("archive.db") } else { p })
}
/// Get the default archive path: `<local-data-dir>/gallery-dl/archive.db`.
///
/// Uses the platform-specific local data directory reported by the `dirs`
/// crate, falling back to the current directory
/// (`./gallery-dl/archive.db`) when it cannot be determined.
///
/// NOTE(review): the `--download-archive-skip-duplicates` help text
/// advertises `~/.gallery-dl/archive.db`, which does not match this
/// location — confirm which path is intended and align the two.
pub fn default_archive_path() -> PathBuf {
    let mut path = dirs::data_local_dir().unwrap_or_else(|| PathBuf::from("."));
    path.push("gallery-dl");
    path.push("archive.db");
    path
}
/// Configuration file type
#[derive(Debug, Clone, ValueEnum)]
pub enum ConfigType {

View File

@@ -10,12 +10,14 @@ pub mod worker;
pub use templates::{PathTemplate, TemplateError};
pub use worker::{DownloadItem, DownloadWorker, DownloadWorkerResult, download_batch};
use crate::archive::{DownloadArchive, SqliteArchive};
use progress::DownloadProgress;
use resume::{get_partial_path, get_resume_offset, ResumeError};
use reqwest::header::{HeaderMap, RANGE};
use reqwest::Client;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use thiserror::Error;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
@@ -60,6 +62,8 @@ pub struct DownloadOptions {
pub resume: bool,
/// File filter options
pub filter: Option<FileFilter>,
/// Archive for tracking downloaded files
pub archive: Option<Arc<SqliteArchive>>,
}
/// File filter for skipping downloads based on size or type
@@ -172,6 +176,7 @@ impl DownloadOptions {
headers: HeaderMap::new(),
resume: true,
filter: None,
archive: None,
}
}
@@ -204,6 +209,12 @@ impl DownloadOptions {
self.filter = Some(filter);
self
}
/// Set archive for duplicate detection.
///
/// Downloads whose (url, filename) pair is already recorded in the
/// archive are skipped by `DownloadManager::download`; successful
/// downloads are recorded back into it.
pub fn archive(mut self, archive: Arc<SqliteArchive>) -> Self {
    self.archive = Some(archive);
    self
}
}
/// Result of a successful download
@@ -242,6 +253,20 @@ impl DownloadManager {
pub async fn download(&self, options: DownloadOptions) -> Result<DownloadResult, DownloadError> {
let start_time = std::time::Instant::now();
// Get the filename from the destination path
let filename = options.destination
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("unknown");
// Check archive for duplicate before downloading
if let Some(ref archive) = options.archive {
if archive.contains(&options.url, filename) {
println!("Skipping {} - already in archive", filename);
return Err(DownloadError::Filtered("already in archive".to_string()));
}
}
// Get the .part file path
let partial_path = get_partial_path(&options.destination);
@@ -374,6 +399,13 @@ impl DownloadManager {
// Rename .part file to final destination
tokio::fs::rename(&partial_path, &options.destination).await?;
// Record to archive after successful download
if let Some(ref archive) = options.archive {
if let Err(e) = archive.add(&options.url, filename, None, Some(downloaded), None) {
log::warn!("Failed to record download to archive: {}", e);
}
}
let duration_ms = start_time.elapsed().as_millis() as u64;
Ok(DownloadResult {

View File

@@ -9,6 +9,7 @@ pub mod logging;
pub mod extractor;
pub mod download;
pub mod postprocess;
pub mod archive;
// Re-export extractor types for library users
pub use extractor::{
@@ -42,6 +43,11 @@ pub use postprocess::{
PostProcessError,
};
// Re-export archive types for library users
pub use archive::{
DownloadArchive, SqliteArchive, ArchiveError,
};
/// Version of the gallery-dl crate
pub const VERSION: &str = env!("CARGO_PKG_VERSION");