diff --git a/.gitignore b/.gitignore index e068618c..b9558103 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ archive/ +.claude/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/src/auth/browser.rs b/src/auth/browser.rs index c77a53c9..c705dc79 100644 --- a/src/auth/browser.rs +++ b/src/auth/browser.rs @@ -1,8 +1,15 @@ -//! Browser cookie extraction for Firefox and Chrome +//! Browser cookie extraction for Firefox-based browsers and Chrome //! //! This module provides functionality to extract cookies directly from //! browser SQLite cookie databases, enabling seamless authentication //! without manual cookie file exports. +//! +//! Supported browsers: +//! - Firefox (all platforms) +//! - Zen Browser (Firefox-based) +//! - LibreWolf (Firefox-based) +//! - Waterfox (Firefox-based) +//! - Chrome / Chromium (all platforms, plaintext cookies only) use rusqlite::Connection; use std::collections::HashMap; @@ -60,6 +67,17 @@ fn get_home_dir() -> Result { .ok_or_else(|| BrowserError::Other("Could not determine home directory".to_string())) } +/// Get the APPDATA directory (Windows only, falls back to home) +fn get_appdata_dir() -> Result { + // Try APPDATA env var first (Windows) + if let Ok(appdata) = std::env::var("APPDATA") { + return Ok(PathBuf::from(appdata)); + } + // Fallback: use dirs crate + dirs::config_dir() + .ok_or_else(|| BrowserError::Other("Could not determine config directory".to_string())) +} + /// Copy a file to a temporary location to avoid locking issues fn copy_to_temp>(path: P) -> Result { let temp_file = tempfile::NamedTempFile::new()?; @@ -67,32 +85,26 @@ fn copy_to_temp>(path: P) -> Result Result { - let home = get_home_dir()?; - let firefox_dir = home.join(".mozilla").join("firefox"); +/// Searches each candidate for subdirectories containing `cookies.sqlite`. +/// Prefers `default-release` profiles, then `default` profiles. +fn find_profile_in_dirs(candidate_dirs: &[PathBuf]) -> Result { + for dir in candidate_dirs { + if !dir.exists() { + continue; + } - if !firefox_dir.exists() { - return Err(BrowserError::ProfileNotFound(format!( - "Firefox directory not found: {:?}", - firefox_dir - ))); - } + let entries = match fs::read_dir(dir) { + Ok(e) => e, + Err(_) => continue, + }; - // Read directory entries - let entries = fs::read_dir(&firefox_dir).map_err(|e| BrowserError::Io(e))?; + let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new(); - let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new(); - - for entry in entries.flatten() { - let path = entry.path(); - if path.is_dir() { - // Check if this is a profile directory (contains cookies.sqlite) - let cookies_path = path.join("cookies.sqlite"); - if cookies_path.exists() { - // Get the profile name from the directory name + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() && path.join("cookies.sqlite").exists() { let name = path .file_name() .and_then(|n| n.to_str()) @@ -101,46 +113,112 @@ pub fn find_firefox_profile() -> Result { profile_dirs.push((name, path)); } } - } - if profile_dirs.is_empty() { - return Err(BrowserError::ProfileNotFound( - "No Firefox profiles with cookies found".to_string(), - )); - } - - // Prefer default-release profile, otherwise use first available - profile_dirs.sort_by(|a, b| { - let a_default = a.0.contains("default-release"); - let b_default = b.0.contains("default-release"); - match (a_default, b_default) { - (true, false) => std::cmp::Ordering::Less, - (false, true) => std::cmp::Ordering::Greater, - _ => std::cmp::Ordering::Equal, + if profile_dirs.is_empty() { + continue; } - }); - let selected = &profile_dirs[0].1; - log::info!("Found Firefox profile: {:?}", selected); - Ok(selected.clone()) + // Sort: prefer default-release > default > anything else + profile_dirs.sort_by(|a, b| { + fn rank(name: &str) -> u8 { + if name.contains("default-release") { 0 } + else if name.contains("default") { 1 } + else { 2 } + } + rank(&a.0).cmp(&rank(&b.0)) + }); + + let selected = &profile_dirs[0].1; + log::info!("Found browser profile: {:?}", selected); + return Ok(selected.clone()); + } + + Err(BrowserError::ProfileNotFound(format!( + "No profiles with cookies found. Searched: {:?}", + candidate_dirs + ))) } -/// Extract cookies from Firefox profile -/// -/// # Arguments -/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com") -/// -/// Returns a HashMap of cookie name -> value -pub fn extract_firefox_cookies( +/// Get candidate profile directories for Firefox +fn firefox_profile_dirs() -> Vec { + let mut dirs = Vec::new(); + if let Ok(home) = get_home_dir() { + // Windows + if let Ok(appdata) = get_appdata_dir() { + dirs.push(appdata.join("Mozilla").join("Firefox").join("Profiles")); + } + // Linux + dirs.push(home.join(".mozilla").join("firefox")); + // macOS + dirs.push(home.join("Library").join("Application Support").join("Firefox").join("Profiles")); + } + dirs +} + +/// Get candidate profile directories for Zen Browser +fn zen_profile_dirs() -> Vec { + let mut dirs = Vec::new(); + if let Ok(home) = get_home_dir() { + // Windows + if let Ok(appdata) = get_appdata_dir() { + dirs.push(appdata.join("zen").join("Profiles")); + } + // Linux + dirs.push(home.join(".zen")); + // macOS + dirs.push(home.join("Library").join("Application Support").join("zen").join("Profiles")); + } + dirs +} + +/// Get candidate profile directories for LibreWolf +fn librewolf_profile_dirs() -> Vec { + let mut dirs = Vec::new(); + if let Ok(home) = get_home_dir() { + if let Ok(appdata) = get_appdata_dir() { + dirs.push(appdata.join("librewolf").join("Profiles")); + } + dirs.push(home.join(".librewolf")); + dirs.push(home.join("Library").join("Application Support").join("librewolf").join("Profiles")); + } + dirs +} + +/// Get candidate profile directories for Waterfox +fn waterfox_profile_dirs() -> Vec { + let mut dirs = Vec::new(); + if let Ok(home) = get_home_dir() { + if let Ok(appdata) = get_appdata_dir() { + dirs.push(appdata.join("Waterfox").join("Profiles")); + } + dirs.push(home.join(".waterfox")); + dirs.push(home.join("Library").join("Application Support").join("Waterfox").join("Profiles")); + } + dirs +} + +/// Find a Firefox profile directory (searches standard Firefox locations) +pub fn find_firefox_profile() -> Result { + find_profile_in_dirs(&firefox_profile_dirs()) +} + +/// Find a Zen Browser profile directory +pub fn find_zen_profile() -> Result { + find_profile_in_dirs(&zen_profile_dirs()) +} + +/// Extract cookies from a Firefox-compatible SQLite database (moz_cookies table) +fn extract_moz_cookies( + profile_dir: &PathBuf, domain: Option<&str>, + browser_name: &str, ) -> Result, BrowserError> { - let profile_dir = find_firefox_profile()?; let cookies_path = profile_dir.join("cookies.sqlite"); if !cookies_path.exists() { return Err(BrowserError::DatabaseNotFound(format!( - "Firefox cookies database not found: {:?}", - cookies_path + "{} cookies database not found: {:?}", + browser_name, cookies_path ))); } @@ -150,7 +228,6 @@ pub fn extract_firefox_cookies( let cookies: HashMap = match domain { Some(d) => { - // Query with domain filter let pattern = format!("%{}", d); let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies WHERE host LIKE ?")?; let mut cookies = HashMap::new(); @@ -163,7 +240,6 @@ pub fn extract_firefox_cookies( cookies } None => { - // Get all cookies let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies")?; let mut cookies = HashMap::new(); let rows = stmt.query_map([], |row| { @@ -176,24 +252,61 @@ pub fn extract_firefox_cookies( } }; - log::info!("Extracted {} cookies from Firefox", cookies.len()); + log::info!("Extracted {} cookies from {}", cookies.len(), browser_name); Ok(cookies) } +/// Extract cookies from Firefox +pub fn extract_firefox_cookies( + domain: Option<&str>, +) -> Result, BrowserError> { + let profile_dir = find_profile_in_dirs(&firefox_profile_dirs())?; + extract_moz_cookies(&profile_dir, domain, "Firefox") +} + +/// Extract cookies from Zen Browser +pub fn extract_zen_cookies( + domain: Option<&str>, +) -> Result, BrowserError> { + let profile_dir = find_profile_in_dirs(&zen_profile_dirs())?; + extract_moz_cookies(&profile_dir, domain, "Zen Browser") +} + +/// Extract cookies from LibreWolf +pub fn extract_librewolf_cookies( + domain: Option<&str>, +) -> Result, BrowserError> { + let profile_dir = find_profile_in_dirs(&librewolf_profile_dirs())?; + extract_moz_cookies(&profile_dir, domain, "LibreWolf") +} + +/// Extract cookies from Waterfox +pub fn extract_waterfox_cookies( + domain: Option<&str>, +) -> Result, BrowserError> { + let profile_dir = find_profile_in_dirs(&waterfox_profile_dirs())?; + extract_moz_cookies(&profile_dir, domain, "Waterfox") +} + /// Find the Chrome profile directory -/// -/// Searches in ~/.config/google-chrome/ for Default profile pub fn find_chrome_profile() -> Result { let home = get_home_dir()?; - // Try different possible Chrome config locations - let possible_paths = vec![ - home.join(".config").join("google-chrome"), - home.join(".config").join("chromium"), - home.join("Library") - .join("Application Support") - .join("Google Chrome"), - ]; + let mut possible_paths = Vec::new(); + + // Windows + if let Ok(local_appdata) = std::env::var("LOCALAPPDATA") { + let local = PathBuf::from(local_appdata); + possible_paths.push(local.join("Google").join("Chrome").join("User Data")); + possible_paths.push(local.join("Chromium").join("User Data")); + } + + // Linux + possible_paths.push(home.join(".config").join("google-chrome")); + possible_paths.push(home.join(".config").join("chromium")); + + // macOS + possible_paths.push(home.join("Library").join("Application Support").join("Google Chrome")); for chrome_dir in possible_paths { if chrome_dir.exists() { @@ -215,13 +328,8 @@ pub fn find_chrome_profile() -> Result { /// Extract cookies from Chrome profile /// -/// Note: Chrome stores some cookies with encrypted values using the OS keyring. -/// This function extracts plaintext cookies and logs a warning for encrypted ones. -/// -/// # Arguments -/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com") -/// -/// Returns a HashMap of cookie name -> value +/// Note: Chrome encrypts most cookies using the OS keyring. +/// This function extracts plaintext cookies and skips encrypted ones. pub fn extract_chrome_cookies( domain: Option<&str>, ) -> Result, BrowserError> { @@ -235,14 +343,12 @@ pub fn extract_chrome_cookies( ))); } - // Copy to temp to avoid locking let temp_path = copy_to_temp(&cookies_path)?; let conn = Connection::open(&temp_path)?; let mut cookies = HashMap::new(); let mut encrypted_count = 0; - // Chrome uses different table schema - check for encrypted_value column let has_encrypted = conn .query_row( "SELECT COUNT(*) FROM pragma_table_info('cookies') WHERE name='encrypted_value'", @@ -252,14 +358,13 @@ pub fn extract_chrome_cookies( .unwrap_or(0) > 0; - // Always select with domain filter (use wildcard for all) let domain_pattern = match domain { Some(d) => format!("%{}%", d), None => "%".to_string(), }; let mut stmt = - conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host LIKE ?")?; + conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host_key LIKE ?")?; let rows = stmt.query_map([domain_pattern], |row| { let name: String = row.get(0)?; @@ -271,12 +376,11 @@ pub fn extract_chrome_cookies( for row_result in rows { let (name, value, encrypted) = row_result?; - // Check if cookie has encrypted value if has_encrypted { if let Some(enc) = encrypted { if !enc.is_empty() { encrypted_count += 1; - continue; // Skip encrypted cookies + continue; } } } @@ -287,7 +391,7 @@ pub fn extract_chrome_cookies( if encrypted_count > 0 { log::warn!( "Skipped {} encrypted Chrome cookies (OS keyring required). \ - Run with --cookies-file for encrypted cookies.", + Use --cookies with a cookies.txt file instead.", encrypted_count ); } @@ -300,32 +404,22 @@ pub fn extract_chrome_cookies( Ok(cookies) } -/// Extract cookies from a browser +/// Extract cookies from a browser by name /// -/// # Arguments -/// * `browser` - Browser name: "firefox", "chrome", or "chromium" -/// * `domain` - Optional domain to filter cookies -/// -/// # Example -/// ```no_run -/// use gallery_dl::auth::extract_browser_cookies; -/// -/// // Get all cookies from Firefox -/// let cookies = extract_browser_cookies("firefox", None).unwrap(); -/// -/// // Get Twitter cookies from Chrome -/// let twitter_cookies = extract_browser_cookies("chrome", Some("twitter.com")).unwrap(); -/// ``` +/// Supported browsers: firefox, zen, librewolf, waterfox, chrome, chromium pub fn extract_browser_cookies( browser: &str, domain: Option<&str>, ) -> Result, BrowserError> { match browser.to_lowercase().as_str() { "firefox" | "ff" => extract_firefox_cookies(domain), + "zen" | "zen-browser" => extract_zen_cookies(domain), + "librewolf" => extract_librewolf_cookies(domain), + "waterfox" => extract_waterfox_cookies(domain), "chrome" | "google-chrome" => extract_chrome_cookies(domain), "chromium" => extract_chrome_cookies(domain), _ => Err(BrowserError::Other(format!( - "Unsupported browser: {}. Supported: firefox, chrome, chromium", + "Unsupported browser: '{}'. Supported: firefox, zen, librewolf, waterfox, chrome, chromium", browser ))), } @@ -334,7 +428,6 @@ pub fn extract_browser_cookies( #[cfg(test)] mod tests { use super::*; - use std::env; #[test] fn test_get_home_dir() { @@ -350,25 +443,26 @@ mod tests { #[test] fn test_extract_browser_cookies_case_insensitive() { - // Should not error, just return empty or ProfileNotFound let result = extract_browser_cookies("FIREFOX", None); - // Either works or profile not found (acceptable in test env) + assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_)))); + } + + #[test] + fn test_zen_browser_recognized() { + let result = extract_browser_cookies("zen", None); + // Should be ProfileNotFound (not unsupported browser error) assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_)))); } #[test] fn test_firefox_cookies_with_domain() { - // Should not error even if profile not found in test env let result = extract_firefox_cookies(Some("twitter.com")); - // Either works or profile not found (acceptable in test env) assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_)))); } #[test] fn test_chrome_cookies_with_domain() { - // Should not error even if profile not found in test env let result = extract_chrome_cookies(Some("twitter.com")); - // Either works or profile not found (acceptable in test env) assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_)))); } } diff --git a/src/extractor/extractors/mod.rs b/src/extractor/extractors/mod.rs index c0c20466..51aee4df 100644 --- a/src/extractor/extractors/mod.rs +++ b/src/extractor/extractors/mod.rs @@ -246,6 +246,7 @@ mod rawkuma; mod readcomiconline; mod schalenetwork; mod shimmie2; +mod snapchat; mod tungsten; mod weebdex; mod xenforo; @@ -861,6 +862,10 @@ pub fn register_all() { // Register SimplyHentai extractors (simplyhentai.com) register(simplyhentai::SimplyhentaiExtractor::new().expect("Failed to create SimplyHentai extractor")); + // Register Snapchat extractors (snapchat.com) + register(snapchat::SnapchatSpotlightExtractor::new()); + register(snapchat::SnapchatProfileExtractor::new()); + // Register Skeb extractors (skeb.jp) register(skeb::SkebExtractor::new()); diff --git a/src/extractor/extractors/snapchat.rs b/src/extractor/extractors/snapchat.rs new file mode 100644 index 00000000..ec218ef0 --- /dev/null +++ b/src/extractor/extractors/snapchat.rs @@ -0,0 +1,523 @@ +//! Snapchat extractor implementation +//! +//! Supports public Snapchat content: +//! - Spotlight videos: `snapchat.com/spotlight/{id}` +//! - Public profiles/stories: `snapchat.com/add/{username}` +//! +//! Data is extracted from the `__NEXT_DATA__` JSON embedded in the page HTML +//! (Next.js server-side rendering). No authentication required for public content. + +use async_trait::async_trait; +use regex::Regex; +use serde_json::Value; +use std::collections::HashMap; + +use crate::extractor::{Extractor, ExtractorError, ExtractorMatch, Message}; + +const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; + +/// Extract the `__NEXT_DATA__` JSON blob from a Snapchat page +fn extract_next_data(html: &str) -> Option { + let re = Regex::new(r#"]*>(.*?)"#).ok()?; + let caps = re.captures(html)?; + let json_str = caps.get(1)?.as_str(); + serde_json::from_str(json_str).ok() +} + +/// Recursively search a JSON value for all occurrences of a key +fn find_all_values<'a>(json: &'a Value, key: &str) -> Vec<&'a Value> { + let mut results = Vec::new(); + match json { + Value::Object(map) => { + for (k, v) in map { + if k == key { + results.push(v); + } + results.extend(find_all_values(v, key)); + } + } + Value::Array(arr) => { + for v in arr { + results.extend(find_all_values(v, key)); + } + } + _ => {} + } + results +} + +/// Extract a filename from a CDN URL +/// e.g. `https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=...` -> `ABCDEF.mp4` +fn cdn_filename(url: &str) -> Option { + let parsed = url::Url::parse(url).ok()?; + let path = parsed.path(); + // Path is like /d/HASH.27.IRZXSOY or /TYPE/HASH.27.IRZXSOY + let segment = path.rsplit('/').next()?; + // Take everything before the first dot as the hash ID + let hash = segment.split('.').next()?; + if hash.is_empty() { + return None; + } + Some(format!("{}.mp4", hash)) +} + +// ============================================================================ +// SnapchatSpotlightExtractor — single spotlight video +// ============================================================================ + +#[derive(Clone)] +pub struct SnapchatSpotlightExtractor { + pattern: Regex, + spotlight_id: Option, +} + +impl SnapchatSpotlightExtractor { + pub fn new() -> Self { + Self { + pattern: Regex::new( + r"(?:https?://)?(?:www\.)?snapchat\.com/spotlight/([A-Za-z0-9_-]+)" + ).expect("Failed to compile Snapchat spotlight pattern"), + spotlight_id: None, + } + } + + fn create_client(&self) -> Result { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(std::time::Duration::from_secs(30)) + .redirect(reqwest::redirect::Policy::limited(10)) + .build() + .map_err(|e| ExtractorError::ConfigError(e.to_string())) + } + + async fn fetch_page(&self, url: &str) -> Result { + let client = self.create_client()?; + let response = client.get(url).send().await + .map_err(ExtractorError::RequestFailed)?; + + let status = response.status(); + if status.as_u16() == 404 { + return Err(ExtractorError::NotFound(format!("Spotlight not found: {}", url))); + } + if !status.is_success() { + return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16()))); + } + + response.text().await + .map_err(|e| ExtractorError::ParseError(e.to_string())) + } + + fn extract_videos_from_next_data(&self, data: &Value) -> Vec<(String, HashMap)> { + let mut videos = Vec::new(); + + // Look for contentUrl fields (direct video URLs) + let content_urls = find_all_values(data, "contentUrl"); + for url_val in &content_urls { + if let Some(url) = url_val.as_str() { + if url.contains("sc-cdn.net") || url.contains(".mp4") { + let mut meta = HashMap::new(); + videos.push((url.to_string(), meta)); + } + } + } + + // Also check mediaUrl as a fallback + if videos.is_empty() { + let media_urls = find_all_values(data, "mediaUrl"); + for url_val in &media_urls { + if let Some(url) = url_val.as_str() { + if url.contains("sc-cdn.net") || url.contains(".mp4") { + let meta = HashMap::new(); + videos.push((url.to_string(), meta)); + } + } + } + } + + // Enrich with metadata from the same JSON tree + // Try to find upload date, view count, creator info + let upload_dates = find_all_values(data, "uploadDateMs"); + let view_counts = find_all_values(data, "viewCount"); + let usernames = find_all_values(data, "username"); + let display_names = find_all_values(data, "displayName"); + + for (i, (_url, meta)) in videos.iter_mut().enumerate() { + if let Some(date_val) = upload_dates.get(i) { + meta.insert("upload_date".to_string(), (*date_val).clone()); + } else if let Some(date_val) = upload_dates.first() { + meta.insert("upload_date".to_string(), (*date_val).clone()); + } + + if let Some(count_val) = view_counts.get(i) { + meta.insert("view_count".to_string(), (*count_val).clone()); + } else if let Some(count_val) = view_counts.first() { + meta.insert("view_count".to_string(), (*count_val).clone()); + } + + if let Some(user_val) = usernames.get(i).or(usernames.first()) { + meta.insert("username".to_string(), (*user_val).clone()); + } + + if let Some(name_val) = display_names.get(i).or(display_names.first()) { + meta.insert("display_name".to_string(), (*name_val).clone()); + } + } + + videos + } +} + +#[async_trait] +impl Extractor for SnapchatSpotlightExtractor { + fn category(&self) -> &str { "snapchat" } + fn subcategory(&self) -> &str { "spotlight" } + fn root(&self) -> &str { "https://www.snapchat.com" } + fn pattern(&self) -> &Regex { &self.pattern } + + fn clone_extractor(&self) -> Box { + Box::new(self.clone()) + } + + async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> { + if let Some(caps) = self.pattern.captures(&m.url) { + self.spotlight_id = caps.get(1).map(|m| m.as_str().to_string()); + } + Ok(()) + } + + async fn items(&mut self) -> Result, ExtractorError> { + let spotlight_id = self.spotlight_id.as_ref() + .ok_or_else(|| ExtractorError::NotInitialized("spotlight_id not set".to_string()))?; + + let url = format!("https://www.snapchat.com/spotlight/{}", spotlight_id); + log::info!("Fetching Snapchat spotlight: {}", url); + + let html = self.fetch_page(&url).await?; + + let next_data = extract_next_data(&html) + .ok_or_else(|| ExtractorError::ParseError( + "Could not find __NEXT_DATA__ in page HTML".to_string() + ))?; + + let videos = self.extract_videos_from_next_data(&next_data); + + if videos.is_empty() { + return Err(ExtractorError::ParseError( + "No video URLs found in spotlight data".to_string() + )); + } + + let mut messages = Vec::new(); + + // Directory message + let creator = videos.first() + .and_then(|(_, meta)| meta.get("username")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + + let dir_msg = Message::directory("") + .with_metadata("category", serde_json::json!("snapchat")) + .with_metadata("subcategory", serde_json::json!("spotlight")) + .with_metadata("title", serde_json::json!(format!("spotlight_{}", spotlight_id))) + .with_metadata("creator", serde_json::json!(creator)); + messages.push(dir_msg); + + for (video_url, meta) in &videos { + let filename = cdn_filename(video_url) + .unwrap_or_else(|| format!("{}.mp4", spotlight_id)); + + let mut msg = Message::url(video_url) + .with_filename(&filename); + + for (key, val) in meta { + msg = msg.with_metadata(key, val.clone()); + } + + messages.push(msg); + } + + log::info!("Found {} video(s) in spotlight {}", videos.len(), spotlight_id); + Ok(messages) + } +} + +// ============================================================================ +// SnapchatProfileExtractor — public profile stories +// ============================================================================ + +#[derive(Clone)] +pub struct SnapchatProfileExtractor { + pattern: Regex, + username: Option, +} + +impl SnapchatProfileExtractor { + pub fn new() -> Self { + Self { + pattern: Regex::new( + r"(?:https?://)?(?:www\.)?snapchat\.com/add/([A-Za-z0-9._-]+)" + ).expect("Failed to compile Snapchat profile pattern"), + username: None, + } + } + + fn create_client(&self) -> Result { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(std::time::Duration::from_secs(30)) + .redirect(reqwest::redirect::Policy::limited(10)) + .build() + .map_err(|e| ExtractorError::ConfigError(e.to_string())) + } + + async fn fetch_page(&self, url: &str) -> Result { + let client = self.create_client()?; + let response = client.get(url).send().await + .map_err(ExtractorError::RequestFailed)?; + + let status = response.status(); + if status.as_u16() == 404 { + return Err(ExtractorError::NotFound(format!("Profile not found: {}", url))); + } + if !status.is_success() { + return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16()))); + } + + response.text().await + .map_err(|e| ExtractorError::ParseError(e.to_string())) + } +} + +#[async_trait] +impl Extractor for SnapchatProfileExtractor { + fn category(&self) -> &str { "snapchat" } + fn subcategory(&self) -> &str { "profile" } + fn root(&self) -> &str { "https://www.snapchat.com" } + fn pattern(&self) -> &Regex { &self.pattern } + + fn clone_extractor(&self) -> Box { + Box::new(self.clone()) + } + + async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> { + if let Some(caps) = self.pattern.captures(&m.url) { + self.username = caps.get(1).map(|m| m.as_str().to_string()); + } + Ok(()) + } + + async fn items(&mut self) -> Result, ExtractorError> { + let username = self.username.as_ref() + .ok_or_else(|| ExtractorError::NotInitialized("username not set".to_string()))?; + + let url = format!("https://www.snapchat.com/add/{}", username); + log::info!("Fetching Snapchat profile: {}", url); + + let html = self.fetch_page(&url).await?; + + let next_data = extract_next_data(&html) + .ok_or_else(|| ExtractorError::ParseError( + "Could not find __NEXT_DATA__ in page HTML. Profile may be private or empty.".to_string() + ))?; + + // Extract all media URLs from the profile data + let mut media_urls: Vec = Vec::new(); + + // Look for contentUrl (videos) + for val in find_all_values(&next_data, "contentUrl") { + if let Some(url) = val.as_str() { + if url.contains("sc-cdn.net") || url.contains(".mp4") { + media_urls.push(url.to_string()); + } + } + } + + // Look for mediaUrl (alternate/additional media) + for val in find_all_values(&next_data, "mediaUrl") { + if let Some(url) = val.as_str() { + if (url.contains("sc-cdn.net") || url.contains(".mp4") || url.contains(".jpg") || url.contains(".png")) + && !media_urls.contains(&url.to_string()) + { + media_urls.push(url.to_string()); + } + } + } + + // Look for snapMediaUrl (story media) + for val in find_all_values(&next_data, "snapMediaUrl") { + if let Some(url) = val.as_str() { + if !media_urls.contains(&url.to_string()) { + media_urls.push(url.to_string()); + } + } + } + + // Look for thumbnailUrl (image previews) + for val in find_all_values(&next_data, "thumbnailUrl") { + if let Some(url) = val.as_str() { + if !media_urls.contains(&url.to_string()) { + media_urls.push(url.to_string()); + } + } + } + + let mut messages = Vec::new(); + + // Directory message + let display_name = find_all_values(&next_data, "displayName") + .first() + .and_then(|v| v.as_str()) + .unwrap_or(username.as_str()) + .to_string(); + + let dir_msg = Message::directory("") + .with_metadata("category", serde_json::json!("snapchat")) + .with_metadata("subcategory", serde_json::json!("profile")) + .with_metadata("title", serde_json::json!(&display_name)) + .with_metadata("username", serde_json::json!(username)); + messages.push(dir_msg); + + if media_urls.is_empty() { + log::warn!("No media found on profile {}. It may be private or have no public stories.", username); + return Ok(messages); + } + + for (i, media_url) in media_urls.iter().enumerate() { + let filename = cdn_filename(media_url) + .unwrap_or_else(|| { + let ext = if media_url.contains(".mp4") { "mp4" } + else if media_url.contains(".jpg") || media_url.contains(".jpeg") { "jpg" } + else if media_url.contains(".png") { "png" } + else { "mp4" }; + format!("{}_{:03}.{}", username, i + 1, ext) + }); + + let msg = Message::url(media_url) + .with_filename(&filename) + .with_metadata("username", serde_json::json!(username)) + .with_metadata("num", serde_json::json!(i + 1)); + + messages.push(msg); + } + + log::info!("Found {} media item(s) on profile {}", media_urls.len(), username); + Ok(messages) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_spotlight_pattern() { + let ext = SnapchatSpotlightExtractor::new(); + assert!(ext.pattern.is_match("https://www.snapchat.com/spotlight/ABC123_def")); + assert!(ext.pattern.is_match("https://snapchat.com/spotlight/ABC123")); + assert!(ext.pattern.is_match("http://www.snapchat.com/spotlight/test-id_123")); + assert!(!ext.pattern.is_match("https://snapchat.com/add/username")); + assert!(!ext.pattern.is_match("https://snapchat.com/")); + } + + #[test] + fn test_profile_pattern() { + let ext = SnapchatProfileExtractor::new(); + assert!(ext.pattern.is_match("https://www.snapchat.com/add/john_doe")); + assert!(ext.pattern.is_match("https://snapchat.com/add/user.name")); + assert!(ext.pattern.is_match("http://www.snapchat.com/add/test-user")); + assert!(!ext.pattern.is_match("https://snapchat.com/spotlight/ABC123")); + assert!(!ext.pattern.is_match("https://snapchat.com/")); + } + + #[test] + fn test_extract_next_data() { + let html = r#""#; + let data = extract_next_data(html); + assert!(data.is_some()); + let data = data.unwrap(); + let urls = find_all_values(&data, "contentUrl"); + assert_eq!(urls.len(), 1); + assert_eq!(urls[0].as_str().unwrap(), "https://cf-st.sc-cdn.net/d/HASH123.27.IRZXSOY?mo=test"); + } + + #[test] + fn test_extract_next_data_missing() { + let html = r#"No next data here"#; + assert!(extract_next_data(html).is_none()); + } + + #[test] + fn test_find_all_values() { + let json: Value = serde_json::json!({ + "a": { + "contentUrl": "url1", + "nested": { + "contentUrl": "url2" + } + }, + "b": [ + {"contentUrl": "url3"}, + {"other": "ignored"} + ] + }); + let urls = find_all_values(&json, "contentUrl"); + assert_eq!(urls.len(), 3); + } + + #[test] + fn test_cdn_filename() { + assert_eq!( + cdn_filename("https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=test&uc=46"), + Some("ABCDEF.mp4".to_string()) + ); + assert_eq!( + cdn_filename("https://bolt-gcdn.sc-cdn.net/video/HASH123.27.IRZXSOY?mo=test"), + Some("HASH123.mp4".to_string()) + ); + } + + #[test] + fn test_cdn_filename_no_hash() { + // Should still extract something from normal URLs + assert!(cdn_filename("https://example.com/some/path/file.mp4").is_some()); + } + + #[test] + fn test_spotlight_extract_videos() { + let ext = SnapchatSpotlightExtractor::new(); + let data: Value = serde_json::json!({ + "props": { + "pageProps": { + "story": { + "contentUrl": "https://cf-st.sc-cdn.net/d/ABC.27.IRZXSOY?mo=test", + "uploadDateMs": 1700000000000_u64, + "viewCount": 50000, + "username": "testuser", + "displayName": "Test User" + } + } + } + }); + let videos = ext.extract_videos_from_next_data(&data); + assert_eq!(videos.len(), 1); + assert!(videos[0].0.contains("sc-cdn.net")); + assert!(videos[0].1.contains_key("username")); + } + + #[test] + fn test_spotlight_mediaurl_fallback() { + let ext = SnapchatSpotlightExtractor::new(); + let data: Value = serde_json::json!({ + "props": { + "pageProps": { + "media": { + "mediaUrl": "https://cf-st.sc-cdn.net/d/FALLBACK.27.IRZXSOY?mo=x" + } + } + } + }); + let videos = ext.extract_videos_from_next_data(&data); + assert_eq!(videos.len(), 1); + assert!(videos[0].0.contains("FALLBACK")); + } +} diff --git a/src/extractor/extractors/xenforo.rs b/src/extractor/extractors/xenforo.rs index b407d054..28a7fbcb 100644 --- a/src/extractor/extractors/xenforo.rs +++ b/src/extractor/extractors/xenforo.rs @@ -1,40 +1,462 @@ //! XenForo extractor implementation //! //! Supports XenForo forums (simpcity.cr, nudostar.com/forum, etc.) +//! Extracts images and videos from thread posts with pagination support. use async_trait::async_trait; use regex::Regex; +use std::collections::{HashMap, HashSet}; use crate::extractor::{ - Extractor, ExtractorError, ExtractorMatch, HttpClient, Message, + Extractor, ExtractorError, ExtractorMatch, Message, MessageKind, }; -pub struct XenforoPostExtractor { - pattern: Regex, - category: String, - subcategory: String, - root_url: String, - post_id: Option, - client: HttpClient, +const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; + +/// Known XenForo domains and their root URLs +fn root_for_domain(domain: &str) -> String { + match domain { + "simpcity.cr" | "simpcity.su" => format!("https://{}", domain), + "nudostar.com/forum" => "https://nudostar.com/forum".to_string(), + "allthefallen.moe/forum" => "https://allthefallen.moe/forum".to_string(), + "celebforum.to" => "https://celebforum.to".to_string(), + "forums.socialmediagirls.com" => "https://forums.socialmediagirls.com".to_string(), + _ => format!("https://{}", domain), + } } +/// Build a cookie header string from a HashMap +fn cookie_header(cookies: &HashMap) -> String { + cookies.iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join("; ") +} + +/// Extract media URLs from HTML content. +/// +/// Finds all media by matching multiple patterns: +/// - `` — inline images +/// - `