feat: add Snapchat extractor, improve browser auth and XenForo support

- Add new Snapchat story extractor with spotlight and user story support
- Expand browser cookie extraction to support Zen Browser and multi-platform profiles
- Significantly enhance XenForo extractor with gallery, media, and attachment support
- Add APPDATA-based profile discovery for Windows browsers
- Update main.rs with new extractor wiring and improved CLI handling

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 16:29:16 +01:00
parent e4dae6de12
commit ca342ee3a3
6 changed files with 1441 additions and 243 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
archive/
.claude/
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -1,8 +1,15 @@
//! Browser cookie extraction for Firefox and Chrome
//! Browser cookie extraction for Firefox-based browsers and Chrome
//!
//! This module provides functionality to extract cookies directly from
//! browser SQLite cookie databases, enabling seamless authentication
//! without manual cookie file exports.
//!
//! Supported browsers:
//! - Firefox (all platforms)
//! - Zen Browser (Firefox-based)
//! - LibreWolf (Firefox-based)
//! - Waterfox (Firefox-based)
//! - Chrome / Chromium (all platforms, plaintext cookies only)
use rusqlite::Connection;
use std::collections::HashMap;
@@ -60,6 +67,17 @@ fn get_home_dir() -> Result<PathBuf, BrowserError> {
.ok_or_else(|| BrowserError::Other("Could not determine home directory".to_string()))
}
/// Get the roaming APPDATA directory.
///
/// On Windows this reads the `APPDATA` environment variable; on other
/// platforms (or when the variable is unset) it falls back to the
/// platform config directory reported by the `dirs` crate — note this is
/// the config dir, *not* the home dir.
fn get_appdata_dir() -> Result<PathBuf, BrowserError> {
    // Try APPDATA env var first (Windows)
    if let Ok(appdata) = std::env::var("APPDATA") {
        return Ok(PathBuf::from(appdata));
    }
    // Fallback: use dirs crate
    dirs::config_dir()
        .ok_or_else(|| BrowserError::Other("Could not determine config directory".to_string()))
}
/// Copy a file to a temporary location to avoid locking issues
fn copy_to_temp<P: AsRef<std::path::Path>>(path: P) -> Result<tempfile::TempPath, BrowserError> {
let temp_file = tempfile::NamedTempFile::new()?;
@@ -67,32 +85,26 @@ fn copy_to_temp<P: AsRef<std::path::Path>>(path: P) -> Result<tempfile::TempPath
Ok(temp_file.into_temp_path())
}
/// Find the Firefox profile directory
/// Find a profile directory from a list of candidate parent directories.
///
/// Searches in ~/.mozilla/firefox/ for profiles
pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
let home = get_home_dir()?;
let firefox_dir = home.join(".mozilla").join("firefox");
/// Searches each candidate for subdirectories containing `cookies.sqlite`.
/// Prefers `default-release` profiles, then `default` profiles.
fn find_profile_in_dirs(candidate_dirs: &[PathBuf]) -> Result<PathBuf, BrowserError> {
for dir in candidate_dirs {
if !dir.exists() {
continue;
}
if !firefox_dir.exists() {
return Err(BrowserError::ProfileNotFound(format!(
"Firefox directory not found: {:?}",
firefox_dir
)));
}
let entries = match fs::read_dir(dir) {
Ok(e) => e,
Err(_) => continue,
};
// Read directory entries
let entries = fs::read_dir(&firefox_dir).map_err(|e| BrowserError::Io(e))?;
let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new();
let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new();
for entry in entries.flatten() {
let path = entry.path();
if path.is_dir() {
// Check if this is a profile directory (contains cookies.sqlite)
let cookies_path = path.join("cookies.sqlite");
if cookies_path.exists() {
// Get the profile name from the directory name
for entry in entries.flatten() {
let path = entry.path();
if path.is_dir() && path.join("cookies.sqlite").exists() {
let name = path
.file_name()
.and_then(|n| n.to_str())
@@ -101,46 +113,112 @@ pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
profile_dirs.push((name, path));
}
}
}
if profile_dirs.is_empty() {
return Err(BrowserError::ProfileNotFound(
"No Firefox profiles with cookies found".to_string(),
));
}
// Prefer default-release profile, otherwise use first available
profile_dirs.sort_by(|a, b| {
let a_default = a.0.contains("default-release");
let b_default = b.0.contains("default-release");
match (a_default, b_default) {
(true, false) => std::cmp::Ordering::Less,
(false, true) => std::cmp::Ordering::Greater,
_ => std::cmp::Ordering::Equal,
if profile_dirs.is_empty() {
continue;
}
});
let selected = &profile_dirs[0].1;
log::info!("Found Firefox profile: {:?}", selected);
Ok(selected.clone())
// Sort: prefer default-release > default > anything else
profile_dirs.sort_by(|a, b| {
fn rank(name: &str) -> u8 {
if name.contains("default-release") { 0 }
else if name.contains("default") { 1 }
else { 2 }
}
rank(&a.0).cmp(&rank(&b.0))
});
let selected = &profile_dirs[0].1;
log::info!("Found browser profile: {:?}", selected);
return Ok(selected.clone());
}
Err(BrowserError::ProfileNotFound(format!(
"No profiles with cookies found. Searched: {:?}",
candidate_dirs
)))
}
/// Extract cookies from Firefox profile
///
/// # Arguments
/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com")
///
/// Returns a HashMap of cookie name -> value
pub fn extract_firefox_cookies(
/// Candidate parent directories that may contain Firefox profiles,
/// covering the standard Windows, Linux, and macOS locations.
///
/// Non-existent directories are fine: the caller skips them.
fn firefox_profile_dirs() -> Vec<PathBuf> {
    let home = match get_home_dir() {
        Ok(h) => h,
        Err(_) => return Vec::new(),
    };
    let mut candidates = Vec::new();
    // Windows (roaming AppData)
    if let Ok(appdata) = get_appdata_dir() {
        candidates.push(appdata.join("Mozilla").join("Firefox").join("Profiles"));
    }
    // Linux
    candidates.push(home.join(".mozilla").join("firefox"));
    // macOS
    candidates.push(
        home.join("Library")
            .join("Application Support")
            .join("Firefox")
            .join("Profiles"),
    );
    candidates
}
/// Candidate parent directories that may contain Zen Browser profiles,
/// covering the standard Windows, Linux, and macOS locations.
///
/// Non-existent directories are fine: the caller skips them.
fn zen_profile_dirs() -> Vec<PathBuf> {
    let home = match get_home_dir() {
        Ok(h) => h,
        Err(_) => return Vec::new(),
    };
    let mut candidates = Vec::new();
    // Windows (roaming AppData)
    if let Ok(appdata) = get_appdata_dir() {
        candidates.push(appdata.join("zen").join("Profiles"));
    }
    // Linux
    candidates.push(home.join(".zen"));
    // macOS
    candidates.push(
        home.join("Library")
            .join("Application Support")
            .join("zen")
            .join("Profiles"),
    );
    candidates
}
/// Get candidate profile directories for LibreWolf
/// (Windows AppData, Linux dotfolder, macOS Application Support).
fn librewolf_profile_dirs() -> Vec<PathBuf> {
    let mut dirs = Vec::new();
    if let Ok(home) = get_home_dir() {
        // Windows (roaming AppData; on other platforms this resolves to the
        // config dir and simply won't exist — harmless)
        if let Ok(appdata) = get_appdata_dir() {
            dirs.push(appdata.join("librewolf").join("Profiles"));
        }
        // Linux
        dirs.push(home.join(".librewolf"));
        // macOS
        dirs.push(home.join("Library").join("Application Support").join("librewolf").join("Profiles"));
    }
    dirs
}
/// Get candidate profile directories for Waterfox
/// (Windows AppData, Linux dotfolder, macOS Application Support).
fn waterfox_profile_dirs() -> Vec<PathBuf> {
    let mut dirs = Vec::new();
    if let Ok(home) = get_home_dir() {
        // Windows (roaming AppData; on other platforms this resolves to the
        // config dir and simply won't exist — harmless)
        if let Ok(appdata) = get_appdata_dir() {
            dirs.push(appdata.join("Waterfox").join("Profiles"));
        }
        // Linux
        dirs.push(home.join(".waterfox"));
        // macOS
        dirs.push(home.join("Library").join("Application Support").join("Waterfox").join("Profiles"));
    }
    dirs
}
/// Find a Firefox profile directory (searches standard Firefox locations).
///
/// Returns the best-ranked profile containing a `cookies.sqlite` database
/// (ranking prefers `default-release`, then `default`).
pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
    find_profile_in_dirs(&firefox_profile_dirs())
}
/// Find a Zen Browser profile directory (searches standard Zen locations).
///
/// Returns the best-ranked profile containing a `cookies.sqlite` database
/// (ranking prefers `default-release`, then `default`).
pub fn find_zen_profile() -> Result<PathBuf, BrowserError> {
    find_profile_in_dirs(&zen_profile_dirs())
}
/// Extract cookies from a Firefox-compatible SQLite database (moz_cookies table)
fn extract_moz_cookies(
profile_dir: &PathBuf,
domain: Option<&str>,
browser_name: &str,
) -> Result<HashMap<String, String>, BrowserError> {
let profile_dir = find_firefox_profile()?;
let cookies_path = profile_dir.join("cookies.sqlite");
if !cookies_path.exists() {
return Err(BrowserError::DatabaseNotFound(format!(
"Firefox cookies database not found: {:?}",
cookies_path
"{} cookies database not found: {:?}",
browser_name, cookies_path
)));
}
@@ -150,7 +228,6 @@ pub fn extract_firefox_cookies(
let cookies: HashMap<String, String> = match domain {
Some(d) => {
// Query with domain filter
let pattern = format!("%{}", d);
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies WHERE host LIKE ?")?;
let mut cookies = HashMap::new();
@@ -163,7 +240,6 @@ pub fn extract_firefox_cookies(
cookies
}
None => {
// Get all cookies
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies")?;
let mut cookies = HashMap::new();
let rows = stmt.query_map([], |row| {
@@ -176,24 +252,61 @@ pub fn extract_firefox_cookies(
}
};
log::info!("Extracted {} cookies from Firefox", cookies.len());
log::info!("Extracted {} cookies from {}", cookies.len(), browser_name);
Ok(cookies)
}
/// Extract cookies from Firefox
pub fn extract_firefox_cookies(
domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
let profile_dir = find_profile_in_dirs(&firefox_profile_dirs())?;
extract_moz_cookies(&profile_dir, domain, "Firefox")
}
/// Extract cookies from Zen Browser
pub fn extract_zen_cookies(
domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
let profile_dir = find_profile_in_dirs(&zen_profile_dirs())?;
extract_moz_cookies(&profile_dir, domain, "Zen Browser")
}
/// Extract cookies from LibreWolf.
///
/// # Arguments
/// * `domain` - Optional domain filter (e.g. "twitter.com")
pub fn extract_librewolf_cookies(
    domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
    let profile_dir = find_profile_in_dirs(&librewolf_profile_dirs())?;
    extract_moz_cookies(&profile_dir, domain, "LibreWolf")
}
/// Extract cookies from Waterfox.
///
/// # Arguments
/// * `domain` - Optional domain filter (e.g. "twitter.com")
pub fn extract_waterfox_cookies(
    domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
    let profile_dir = find_profile_in_dirs(&waterfox_profile_dirs())?;
    extract_moz_cookies(&profile_dir, domain, "Waterfox")
}
/// Find the Chrome profile directory
///
/// Searches in ~/.config/google-chrome/ for Default profile
pub fn find_chrome_profile() -> Result<PathBuf, BrowserError> {
let home = get_home_dir()?;
// Try different possible Chrome config locations
let possible_paths = vec![
home.join(".config").join("google-chrome"),
home.join(".config").join("chromium"),
home.join("Library")
.join("Application Support")
.join("Google Chrome"),
];
let mut possible_paths = Vec::new();
// Windows
if let Ok(local_appdata) = std::env::var("LOCALAPPDATA") {
let local = PathBuf::from(local_appdata);
possible_paths.push(local.join("Google").join("Chrome").join("User Data"));
possible_paths.push(local.join("Chromium").join("User Data"));
}
// Linux
possible_paths.push(home.join(".config").join("google-chrome"));
possible_paths.push(home.join(".config").join("chromium"));
// macOS
possible_paths.push(home.join("Library").join("Application Support").join("Google Chrome"));
for chrome_dir in possible_paths {
if chrome_dir.exists() {
@@ -215,13 +328,8 @@ pub fn find_chrome_profile() -> Result<PathBuf, BrowserError> {
/// Extract cookies from Chrome profile
///
/// Note: Chrome stores some cookies with encrypted values using the OS keyring.
/// This function extracts plaintext cookies and logs a warning for encrypted ones.
///
/// # Arguments
/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com")
///
/// Returns a HashMap of cookie name -> value
/// Note: Chrome encrypts most cookies using the OS keyring.
/// This function extracts plaintext cookies and skips encrypted ones.
pub fn extract_chrome_cookies(
domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
@@ -235,14 +343,12 @@ pub fn extract_chrome_cookies(
)));
}
// Copy to temp to avoid locking
let temp_path = copy_to_temp(&cookies_path)?;
let conn = Connection::open(&temp_path)?;
let mut cookies = HashMap::new();
let mut encrypted_count = 0;
// Chrome uses different table schema - check for encrypted_value column
let has_encrypted = conn
.query_row(
"SELECT COUNT(*) FROM pragma_table_info('cookies') WHERE name='encrypted_value'",
@@ -252,14 +358,13 @@ pub fn extract_chrome_cookies(
.unwrap_or(0)
> 0;
// Always select with domain filter (use wildcard for all)
let domain_pattern = match domain {
Some(d) => format!("%{}%", d),
None => "%".to_string(),
};
let mut stmt =
conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host LIKE ?")?;
conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host_key LIKE ?")?;
let rows = stmt.query_map([domain_pattern], |row| {
let name: String = row.get(0)?;
@@ -271,12 +376,11 @@ pub fn extract_chrome_cookies(
for row_result in rows {
let (name, value, encrypted) = row_result?;
// Check if cookie has encrypted value
if has_encrypted {
if let Some(enc) = encrypted {
if !enc.is_empty() {
encrypted_count += 1;
continue; // Skip encrypted cookies
continue;
}
}
}
@@ -287,7 +391,7 @@ pub fn extract_chrome_cookies(
if encrypted_count > 0 {
log::warn!(
"Skipped {} encrypted Chrome cookies (OS keyring required). \
Run with --cookies-file for encrypted cookies.",
Use --cookies with a cookies.txt file instead.",
encrypted_count
);
}
@@ -300,32 +404,22 @@ pub fn extract_chrome_cookies(
Ok(cookies)
}
/// Extract cookies from a browser
/// Extract cookies from a browser by name
///
/// # Arguments
/// * `browser` - Browser name: "firefox", "chrome", or "chromium"
/// * `domain` - Optional domain to filter cookies
///
/// # Example
/// ```no_run
/// use gallery_dl::auth::extract_browser_cookies;
///
/// // Get all cookies from Firefox
/// let cookies = extract_browser_cookies("firefox", None).unwrap();
///
/// // Get Twitter cookies from Chrome
/// let twitter_cookies = extract_browser_cookies("chrome", Some("twitter.com")).unwrap();
/// ```
/// Supported browsers: firefox, zen, librewolf, waterfox, chrome, chromium
pub fn extract_browser_cookies(
browser: &str,
domain: Option<&str>,
) -> Result<HashMap<String, String>, BrowserError> {
match browser.to_lowercase().as_str() {
"firefox" | "ff" => extract_firefox_cookies(domain),
"zen" | "zen-browser" => extract_zen_cookies(domain),
"librewolf" => extract_librewolf_cookies(domain),
"waterfox" => extract_waterfox_cookies(domain),
"chrome" | "google-chrome" => extract_chrome_cookies(domain),
"chromium" => extract_chrome_cookies(domain),
_ => Err(BrowserError::Other(format!(
"Unsupported browser: {}. Supported: firefox, chrome, chromium",
"Unsupported browser: '{}'. Supported: firefox, zen, librewolf, waterfox, chrome, chromium",
browser
))),
}
@@ -334,7 +428,6 @@ pub fn extract_browser_cookies(
#[cfg(test)]
mod tests {
use super::*;
use std::env;
#[test]
fn test_get_home_dir() {
@@ -350,25 +443,26 @@ mod tests {
#[test]
fn test_extract_browser_cookies_case_insensitive() {
    // An uppercase name must still resolve to the Firefox extractor
    // (i.e. NOT produce the "Unsupported browser" error).
    let result = extract_browser_cookies("FIREFOX", None);
    // Either works or profile not found (acceptable in test env)
    assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
}
#[test]
fn test_zen_browser_recognized() {
    // "zen" must be a recognized browser name.
    let result = extract_browser_cookies("zen", None);
    // Should be ProfileNotFound (not unsupported browser error)
    assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
}
#[test]
fn test_firefox_cookies_with_domain() {
    // Passing a domain filter must not panic even with no profile present.
    let result = extract_firefox_cookies(Some("twitter.com"));
    // Either works or profile not found (acceptable in test env)
    assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
}
#[test]
fn test_chrome_cookies_with_domain() {
    // Passing a domain filter must not panic even with no profile present.
    let result = extract_chrome_cookies(Some("twitter.com"));
    // Either works or profile not found (acceptable in test env)
    assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
}
}

View File

@@ -246,6 +246,7 @@ mod rawkuma;
mod readcomiconline;
mod schalenetwork;
mod shimmie2;
mod snapchat;
mod tungsten;
mod weebdex;
mod xenforo;
@@ -861,6 +862,10 @@ pub fn register_all() {
// Register SimplyHentai extractors (simplyhentai.com)
register(simplyhentai::SimplyhentaiExtractor::new().expect("Failed to create SimplyHentai extractor"));
// Register Snapchat extractors (snapchat.com)
register(snapchat::SnapchatSpotlightExtractor::new());
register(snapchat::SnapchatProfileExtractor::new());
// Register Skeb extractors (skeb.jp)
register(skeb::SkebExtractor::new());

View File

@@ -0,0 +1,523 @@
//! Snapchat extractor implementation
//!
//! Supports public Snapchat content:
//! - Spotlight videos: `snapchat.com/spotlight/{id}`
//! - Public profiles/stories: `snapchat.com/add/{username}`
//!
//! Data is extracted from the `__NEXT_DATA__` JSON embedded in the page HTML
//! (Next.js server-side rendering). No authentication required for public content.
use async_trait::async_trait;
use regex::Regex;
use serde_json::Value;
use std::collections::HashMap;
use crate::extractor::{Extractor, ExtractorError, ExtractorMatch, Message};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Extract the `__NEXT_DATA__` JSON blob from a Snapchat page.
///
/// Returns `None` when the script tag is absent, the regex fails to
/// compile, or the captured text is not valid JSON.
fn extract_next_data(html: &str) -> Option<Value> {
    // (?s) makes `.` match newlines: the embedded JSON is not guaranteed to
    // sit on a single line, and without the flag a multi-line payload would
    // never match. Non-greedy `.*?` stops at the first closing </script>.
    let re = Regex::new(r#"(?s)<script\s+id="__NEXT_DATA__"\s+type="application/json"[^>]*>(.*?)</script>"#).ok()?;
    let caps = re.captures(html)?;
    let json_str = caps.get(1)?.as_str();
    serde_json::from_str(json_str).ok()
}
/// Recursively collect references to every value stored under `key`
/// anywhere in the JSON tree. Objects and arrays are both descended;
/// results come back in depth-first document order.
fn find_all_values<'a>(json: &'a Value, key: &str) -> Vec<&'a Value> {
    // Inner worker appends into a shared accumulator to avoid building and
    // merging many intermediate vectors.
    fn walk<'a>(node: &'a Value, key: &str, out: &mut Vec<&'a Value>) {
        match node {
            Value::Object(map) => {
                for (k, v) in map {
                    if k == key {
                        out.push(v);
                    }
                    walk(v, key, out);
                }
            }
            Value::Array(items) => {
                for item in items {
                    walk(item, key, out);
                }
            }
            _ => {}
        }
    }

    let mut found = Vec::new();
    walk(json, key, &mut found);
    found
}
/// Extract a filename from a CDN URL.
/// e.g. `https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=...` -> `ABCDEF.mp4`
///
/// The hash before the first `.` of the final path segment becomes the base
/// name. The extension is inferred from the URL when recognizable
/// (`.jpg`/`.jpeg`/`.png`); otherwise `.mp4` is assumed (Snapchat's CDN is
/// video-first). Previously `.mp4` was hard-coded, which mislabeled the
/// image URLs fed in by the profile extractor.
fn cdn_filename(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let path = parsed.path();
    // Path is like /d/HASH.27.IRZXSOY or /TYPE/HASH.27.IRZXSOY
    let segment = path.rsplit('/').next()?;
    // Take everything before the first dot as the hash ID
    let hash = segment.split('.').next()?;
    if hash.is_empty() {
        return None;
    }
    let lower = url.to_ascii_lowercase();
    let ext = if lower.contains(".jpg") || lower.contains(".jpeg") {
        "jpg"
    } else if lower.contains(".png") {
        "png"
    } else {
        "mp4"
    };
    Some(format!("{}.{}", hash, ext))
}
// ============================================================================
// SnapchatSpotlightExtractor — single spotlight video
// ============================================================================
/// Extractor for a single Snapchat Spotlight video page
/// (`snapchat.com/spotlight/{id}`).
#[derive(Clone)]
pub struct SnapchatSpotlightExtractor {
    // Compiled URL pattern used both for matching and ID capture.
    pattern: Regex,
    // Spotlight ID captured during `initialize`; None until then.
    spotlight_id: Option<String>,
}
impl SnapchatSpotlightExtractor {
    /// Create a new spotlight extractor with its URL pattern precompiled.
    pub fn new() -> Self {
        Self {
            pattern: Regex::new(
                r"(?:https?://)?(?:www\.)?snapchat\.com/spotlight/([A-Za-z0-9_-]+)"
            ).expect("Failed to compile Snapchat spotlight pattern"),
            spotlight_id: None,
        }
    }

    /// Build an HTTP client with a browser-like user agent, a 30s timeout,
    /// and a bounded redirect policy.
    fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
        reqwest::Client::builder()
            .user_agent(USER_AGENT)
            .timeout(std::time::Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .map_err(|e| ExtractorError::ConfigError(e.to_string()))
    }

    /// Fetch a page and return its HTML body.
    ///
    /// # Errors
    /// * `NotFound` for HTTP 404
    /// * `HttpError` for any other non-success status
    /// * `ParseError` if the body cannot be read as text
    async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
        let client = self.create_client()?;
        let response = client.get(url).send().await
            .map_err(ExtractorError::RequestFailed)?;
        let status = response.status();
        if status.as_u16() == 404 {
            return Err(ExtractorError::NotFound(format!("Spotlight not found: {}", url)));
        }
        if !status.is_success() {
            return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
        }
        response.text().await
            .map_err(|e| ExtractorError::ParseError(e.to_string()))
    }

    /// Collect `(video URL, metadata)` pairs from the `__NEXT_DATA__` tree.
    ///
    /// `contentUrl` fields are preferred; `mediaUrl` is only consulted when
    /// no `contentUrl` matched. Metadata (upload date, view count, creator)
    /// is paired with videos by index, falling back to the first occurrence
    /// when the counts differ.
    fn extract_videos_from_next_data(&self, data: &Value) -> Vec<(String, HashMap<String, Value>)> {
        let mut videos = Vec::new();

        // Primary source: contentUrl fields (direct video URLs).
        // (The map binding needs no `mut` — it is filled in the enrichment
        // pass below via iter_mut; the old `let mut meta` tripped unused_mut.)
        for url_val in find_all_values(data, "contentUrl") {
            if let Some(url) = url_val.as_str() {
                if url.contains("sc-cdn.net") || url.contains(".mp4") {
                    videos.push((url.to_string(), HashMap::new()));
                }
            }
        }

        // Fallback source: mediaUrl fields.
        if videos.is_empty() {
            for url_val in find_all_values(data, "mediaUrl") {
                if let Some(url) = url_val.as_str() {
                    if url.contains("sc-cdn.net") || url.contains(".mp4") {
                        videos.push((url.to_string(), HashMap::new()));
                    }
                }
            }
        }

        // Enrich with metadata discovered elsewhere in the same JSON tree.
        let upload_dates = find_all_values(data, "uploadDateMs");
        let view_counts = find_all_values(data, "viewCount");
        let usernames = find_all_values(data, "username");
        let display_names = find_all_values(data, "displayName");

        for (i, (_url, meta)) in videos.iter_mut().enumerate() {
            // `.get(i).or(first())` == "use the i-th occurrence if present,
            // else the first" — same behavior as the previous if/else chains.
            if let Some(date_val) = upload_dates.get(i).or(upload_dates.first()) {
                meta.insert("upload_date".to_string(), (*date_val).clone());
            }
            if let Some(count_val) = view_counts.get(i).or(view_counts.first()) {
                meta.insert("view_count".to_string(), (*count_val).clone());
            }
            if let Some(user_val) = usernames.get(i).or(usernames.first()) {
                meta.insert("username".to_string(), (*user_val).clone());
            }
            if let Some(name_val) = display_names.get(i).or(display_names.first()) {
                meta.insert("display_name".to_string(), (*name_val).clone());
            }
        }

        videos
    }
}
#[async_trait]
impl Extractor for SnapchatSpotlightExtractor {
    fn category(&self) -> &str { "snapchat" }
    fn subcategory(&self) -> &str { "spotlight" }
    fn root(&self) -> &str { "https://www.snapchat.com" }
    fn pattern(&self) -> &Regex { &self.pattern }

    fn clone_extractor(&self) -> Box<dyn Extractor> {
        Box::new(self.clone())
    }

    /// Capture the spotlight ID from the matched URL.
    async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
        if let Some(caps) = self.pattern.captures(&m.url) {
            self.spotlight_id = caps.get(1).map(|m| m.as_str().to_string());
        }
        Ok(())
    }

    /// Fetch the spotlight page and emit one directory message followed by
    /// one URL message per discovered video.
    ///
    /// # Errors
    /// * `NotInitialized` if `initialize` was never called
    /// * `ParseError` if `__NEXT_DATA__` is missing or yields no video URLs
    async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
        let spotlight_id = self.spotlight_id.as_ref()
            .ok_or_else(|| ExtractorError::NotInitialized("spotlight_id not set".to_string()))?;
        let url = format!("https://www.snapchat.com/spotlight/{}", spotlight_id);
        log::info!("Fetching Snapchat spotlight: {}", url);

        let html = self.fetch_page(&url).await?;
        let next_data = extract_next_data(&html)
            .ok_or_else(|| ExtractorError::ParseError(
                "Could not find __NEXT_DATA__ in page HTML".to_string()
            ))?;

        let videos = self.extract_videos_from_next_data(&next_data);
        if videos.is_empty() {
            return Err(ExtractorError::ParseError(
                "No video URLs found in spotlight data".to_string()
            ));
        }

        let mut messages = Vec::new();

        // Directory message: creator taken from the first video's metadata
        // (falls back to "unknown" when no username was found).
        let creator = videos.first()
            .and_then(|(_, meta)| meta.get("username"))
            .and_then(|v| v.as_str())
            .unwrap_or("unknown");
        let dir_msg = Message::directory("")
            .with_metadata("category", serde_json::json!("snapchat"))
            .with_metadata("subcategory", serde_json::json!("spotlight"))
            .with_metadata("title", serde_json::json!(format!("spotlight_{}", spotlight_id)))
            .with_metadata("creator", serde_json::json!(creator));
        messages.push(dir_msg);

        // One URL message per video, carrying any metadata gathered earlier.
        for (video_url, meta) in &videos {
            let filename = cdn_filename(video_url)
                .unwrap_or_else(|| format!("{}.mp4", spotlight_id));
            let mut msg = Message::url(video_url)
                .with_filename(&filename);
            for (key, val) in meta {
                msg = msg.with_metadata(key, val.clone());
            }
            messages.push(msg);
        }

        log::info!("Found {} video(s) in spotlight {}", videos.len(), spotlight_id);
        Ok(messages)
    }
}
// ============================================================================
// SnapchatProfileExtractor — public profile stories
// ============================================================================
/// Extractor for public Snapchat profile stories
/// (`snapchat.com/add/{username}`).
#[derive(Clone)]
pub struct SnapchatProfileExtractor {
    // Compiled URL pattern used both for matching and username capture.
    pattern: Regex,
    // Username captured during `initialize`; None until then.
    username: Option<String>,
}
impl SnapchatProfileExtractor {
    /// Create a new profile extractor with its URL pattern precompiled.
    pub fn new() -> Self {
        Self {
            pattern: Regex::new(
                r"(?:https?://)?(?:www\.)?snapchat\.com/add/([A-Za-z0-9._-]+)"
            ).expect("Failed to compile Snapchat profile pattern"),
            username: None,
        }
    }

    // NOTE(review): create_client/fetch_page are duplicated from
    // SnapchatSpotlightExtractor — consider extracting a shared helper if a
    // third Snapchat extractor is ever added.

    /// Build an HTTP client with a browser-like user agent, a 30s timeout,
    /// and a bounded redirect policy.
    fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
        reqwest::Client::builder()
            .user_agent(USER_AGENT)
            .timeout(std::time::Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .map_err(|e| ExtractorError::ConfigError(e.to_string()))
    }

    /// Fetch a page and return its HTML body.
    ///
    /// # Errors
    /// * `NotFound` for HTTP 404
    /// * `HttpError` for any other non-success status
    /// * `ParseError` if the body cannot be read as text
    async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
        let client = self.create_client()?;
        let response = client.get(url).send().await
            .map_err(ExtractorError::RequestFailed)?;
        let status = response.status();
        if status.as_u16() == 404 {
            return Err(ExtractorError::NotFound(format!("Profile not found: {}", url)));
        }
        if !status.is_success() {
            return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
        }
        response.text().await
            .map_err(|e| ExtractorError::ParseError(e.to_string()))
    }
}
#[async_trait]
impl Extractor for SnapchatProfileExtractor {
    fn category(&self) -> &str { "snapchat" }
    fn subcategory(&self) -> &str { "profile" }
    fn root(&self) -> &str { "https://www.snapchat.com" }
    fn pattern(&self) -> &Regex { &self.pattern }

    fn clone_extractor(&self) -> Box<dyn Extractor> {
        Box::new(self.clone())
    }

    /// Capture the username from the matched URL.
    async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
        if let Some(caps) = self.pattern.captures(&m.url) {
            self.username = caps.get(1).map(|m| m.as_str().to_string());
        }
        Ok(())
    }

    /// Fetch the profile page and emit one directory message followed by
    /// one URL message per discovered media item.
    ///
    /// An empty profile is not an error: the directory message alone is
    /// returned and a warning is logged.
    ///
    /// # Errors
    /// * `NotInitialized` if `initialize` was never called
    /// * `ParseError` if `__NEXT_DATA__` is missing from the page
    async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
        let username = self.username.as_ref()
            .ok_or_else(|| ExtractorError::NotInitialized("username not set".to_string()))?;
        let url = format!("https://www.snapchat.com/add/{}", username);
        log::info!("Fetching Snapchat profile: {}", url);

        let html = self.fetch_page(&url).await?;
        let next_data = extract_next_data(&html)
            .ok_or_else(|| ExtractorError::ParseError(
                "Could not find __NEXT_DATA__ in page HTML. Profile may be private or empty.".to_string()
            ))?;

        // Collect media URLs, deduplicated while preserving discovery order.
        // A HashSet gives O(1) membership checks instead of the previous
        // Vec::contains scan (O(n) plus a String allocation per candidate).
        let mut media_urls: Vec<String> = Vec::new();
        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

        // contentUrl: videos (restricted to Snapchat CDN / mp4 links)
        for val in find_all_values(&next_data, "contentUrl") {
            if let Some(u) = val.as_str() {
                if (u.contains("sc-cdn.net") || u.contains(".mp4")) && seen.insert(u.to_string()) {
                    media_urls.push(u.to_string());
                }
            }
        }
        // mediaUrl: alternate/additional media (images allowed too)
        for val in find_all_values(&next_data, "mediaUrl") {
            if let Some(u) = val.as_str() {
                if (u.contains("sc-cdn.net") || u.contains(".mp4") || u.contains(".jpg") || u.contains(".png"))
                    && seen.insert(u.to_string())
                {
                    media_urls.push(u.to_string());
                }
            }
        }
        // snapMediaUrl: story media (no filtering)
        for val in find_all_values(&next_data, "snapMediaUrl") {
            if let Some(u) = val.as_str() {
                if seen.insert(u.to_string()) {
                    media_urls.push(u.to_string());
                }
            }
        }
        // thumbnailUrl: image previews (no filtering)
        for val in find_all_values(&next_data, "thumbnailUrl") {
            if let Some(u) = val.as_str() {
                if seen.insert(u.to_string()) {
                    media_urls.push(u.to_string());
                }
            }
        }

        let mut messages = Vec::new();

        // Directory message: prefer the profile's display name, falling back
        // to the URL username.
        let display_name = find_all_values(&next_data, "displayName")
            .first()
            .and_then(|v| v.as_str())
            .unwrap_or(username.as_str())
            .to_string();
        let dir_msg = Message::directory("")
            .with_metadata("category", serde_json::json!("snapchat"))
            .with_metadata("subcategory", serde_json::json!("profile"))
            .with_metadata("title", serde_json::json!(&display_name))
            .with_metadata("username", serde_json::json!(username));
        messages.push(dir_msg);

        if media_urls.is_empty() {
            log::warn!("No media found on profile {}. It may be private or have no public stories.", username);
            return Ok(messages);
        }

        for (i, media_url) in media_urls.iter().enumerate() {
            // Prefer a CDN-derived filename; otherwise build one from the
            // username, index, and a best-guess extension.
            let filename = cdn_filename(media_url)
                .unwrap_or_else(|| {
                    let ext = if media_url.contains(".mp4") { "mp4" }
                    else if media_url.contains(".jpg") || media_url.contains(".jpeg") { "jpg" }
                    else if media_url.contains(".png") { "png" }
                    else { "mp4" };
                    format!("{}_{:03}.{}", username, i + 1, ext)
                });
            let msg = Message::url(media_url)
                .with_filename(&filename)
                .with_metadata("username", serde_json::json!(username))
                .with_metadata("num", serde_json::json!(i + 1));
            messages.push(msg);
        }

        log::info!("Found {} media item(s) on profile {}", media_urls.len(), username);
        Ok(messages)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_spotlight_pattern() {
        // Scheme and "www." are optional; IDs may contain '-' and '_'.
        let ext = SnapchatSpotlightExtractor::new();
        assert!(ext.pattern.is_match("https://www.snapchat.com/spotlight/ABC123_def"));
        assert!(ext.pattern.is_match("https://snapchat.com/spotlight/ABC123"));
        assert!(ext.pattern.is_match("http://www.snapchat.com/spotlight/test-id_123"));
        // Profile URLs and the bare root must not match.
        assert!(!ext.pattern.is_match("https://snapchat.com/add/username"));
        assert!(!ext.pattern.is_match("https://snapchat.com/"));
    }

    #[test]
    fn test_profile_pattern() {
        // Usernames may contain '.', '_' and '-'.
        let ext = SnapchatProfileExtractor::new();
        assert!(ext.pattern.is_match("https://www.snapchat.com/add/john_doe"));
        assert!(ext.pattern.is_match("https://snapchat.com/add/user.name"));
        assert!(ext.pattern.is_match("http://www.snapchat.com/add/test-user"));
        // Spotlight URLs and the bare root must not match.
        assert!(!ext.pattern.is_match("https://snapchat.com/spotlight/ABC123"));
        assert!(!ext.pattern.is_match("https://snapchat.com/"));
    }

    #[test]
    fn test_extract_next_data() {
        // The JSON inside the __NEXT_DATA__ script tag should parse and be
        // searchable for contentUrl entries.
        let html = r#"<html><head><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"story":{"contentUrl":"https://cf-st.sc-cdn.net/d/HASH123.27.IRZXSOY?mo=test"}}}}</script></head></html>"#;
        let data = extract_next_data(html);
        assert!(data.is_some());
        let data = data.unwrap();
        let urls = find_all_values(&data, "contentUrl");
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].as_str().unwrap(), "https://cf-st.sc-cdn.net/d/HASH123.27.IRZXSOY?mo=test");
    }

    #[test]
    fn test_extract_next_data_missing() {
        // A page without the script tag yields None, not a panic.
        let html = r#"<html><head></head><body>No next data here</body></html>"#;
        assert!(extract_next_data(html).is_none());
    }

    #[test]
    fn test_find_all_values() {
        // Matches must be found in nested objects AND inside arrays.
        let json: Value = serde_json::json!({
            "a": {
                "contentUrl": "url1",
                "nested": {
                    "contentUrl": "url2"
                }
            },
            "b": [
                {"contentUrl": "url3"},
                {"other": "ignored"}
            ]
        });
        let urls = find_all_values(&json, "contentUrl");
        assert_eq!(urls.len(), 3);
    }

    #[test]
    fn test_cdn_filename() {
        // The hash before the first dot of the last path segment becomes the
        // base name; query strings are ignored.
        assert_eq!(
            cdn_filename("https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=test&uc=46"),
            Some("ABCDEF.mp4".to_string())
        );
        assert_eq!(
            cdn_filename("https://bolt-gcdn.sc-cdn.net/video/HASH123.27.IRZXSOY?mo=test"),
            Some("HASH123.mp4".to_string())
        );
    }

    #[test]
    fn test_cdn_filename_no_hash() {
        // Should still extract something from normal URLs
        assert!(cdn_filename("https://example.com/some/path/file.mp4").is_some());
    }

    #[test]
    fn test_spotlight_extract_videos() {
        // A contentUrl with sibling metadata should produce one enriched video.
        let ext = SnapchatSpotlightExtractor::new();
        let data: Value = serde_json::json!({
            "props": {
                "pageProps": {
                    "story": {
                        "contentUrl": "https://cf-st.sc-cdn.net/d/ABC.27.IRZXSOY?mo=test",
                        "uploadDateMs": 1700000000000_u64,
                        "viewCount": 50000,
                        "username": "testuser",
                        "displayName": "Test User"
                    }
                }
            }
        });
        let videos = ext.extract_videos_from_next_data(&data);
        assert_eq!(videos.len(), 1);
        assert!(videos[0].0.contains("sc-cdn.net"));
        assert!(videos[0].1.contains_key("username"));
    }

    #[test]
    fn test_spotlight_mediaurl_fallback() {
        // When no contentUrl is present, mediaUrl entries are used instead.
        let ext = SnapchatSpotlightExtractor::new();
        let data: Value = serde_json::json!({
            "props": {
                "pageProps": {
                    "media": {
                        "mediaUrl": "https://cf-st.sc-cdn.net/d/FALLBACK.27.IRZXSOY?mo=x"
                    }
                }
            }
        });
        let videos = ext.extract_videos_from_next_data(&data);
        assert_eq!(videos.len(), 1);
        assert!(videos[0].0.contains("FALLBACK"));
    }
}

View File

@@ -1,40 +1,462 @@
//! XenForo extractor implementation
//!
//! Supports XenForo forums (simpcity.cr, nudostar.com/forum, etc.)
//! Extracts images and videos from thread posts with pagination support.
use async_trait::async_trait;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use crate::extractor::{
Extractor, ExtractorError, ExtractorMatch, HttpClient, Message,
Extractor, ExtractorError, ExtractorMatch, Message, MessageKind,
};
pub struct XenforoPostExtractor {
pattern: Regex,
category: String,
subcategory: String,
root_url: String,
post_id: Option<String>,
client: HttpClient,
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Root URL for a XenForo domain captured from an input URL.
///
/// The captured "domain" may include a path prefix (e.g. "nudostar.com/forum"
/// or "allthefallen.moe/forum"), so the root is always just "https://" plus
/// the captured value. Known hosts matched by the extractors: simpcity.cr,
/// simpcity.su, nudostar.com/forum, allthefallen.moe/forum, celebforum.to,
/// titsintops.com/phpBB2, forums.socialmediagirls.com.
fn root_for_domain(domain: &str) -> String {
    // The previous per-domain match produced exactly this value in every arm,
    // so the special cases were redundant.
    format!("https://{}", domain)
}
/// Serialize a cookie map into a single `Cookie:` header value
/// ("name1=value1; name2=value2"). Values are not escaped.
fn cookie_header(cookies: &HashMap<String, String>) -> String {
    let mut parts: Vec<String> = Vec::with_capacity(cookies.len());
    for (name, value) in cookies {
        parts.push(format!("{}={}", name, value));
    }
    parts.join("; ")
}
/// Extract media URLs from HTML content.
///
/// Finds all media by matching multiple patterns:
/// - `<img class="bbImage" src="...">` — inline images
/// - `<video src="...">` — inline videos
/// - `<a href=".../attachments/...">` — file attachments
/// - `<iframe src="...">` — embedded media
/// - `loadMedia(this, '...')` — lazy-loaded embeds
///
/// Every hit is passed through `normalize_url` (relative paths resolved
/// against `root_url`, style/smiley/avatar junk dropped, thumbnails
/// upgraded). The returned list may contain duplicates — callers de-dupe.
// NOTE(review): the five regexes are recompiled on every call; if this
// function shows up in profiles, hoist them into LazyLock statics.
fn extract_media_from_html(html: &str, root_url: &str) -> Vec<String> {
    let mut urls = Vec::new();
    // 1. bbImage: <img ... class="bbImage" ... src="URL"> or data-url="URL"
    //    The alternation handles both attribute orders (class-then-src and
    //    src-then-class); exactly one of the two capture groups matches.
    let img_re = Regex::new(r#"<img[^>]+class="bbImage[^"]*"[^>]*(?:data-url|src)="([^"]+)"|<img[^>]*(?:data-url|src)="([^"]+)"[^>]*class="bbImage[^"]*""#).unwrap();
    for caps in img_re.captures_iter(html) {
        if let Some(m) = caps.get(1).or(caps.get(2)) {
            urls.push(m.as_str().to_string());
        }
    }
    // 2. Video src
    let video_re = Regex::new(r#"<video[^>]+src="([^"]+)"#).unwrap();
    for caps in video_re.captures_iter(html) {
        if let Some(m) = caps.get(1) {
            urls.push(m.as_str().to_string());
        }
    }
    // 3. Attachments (any anchor whose href contains "/attachments/")
    let attach_re = Regex::new(r#"<a[^>]+href="([^"]+/attachments/[^"]+)"#).unwrap();
    for caps in attach_re.captures_iter(html) {
        if let Some(m) = caps.get(1) {
            urls.push(m.as_str().to_string());
        }
    }
    // 4. Iframes (embedded players, external hosts)
    let iframe_re = Regex::new(r#"<iframe[^>]+src="([^"]+)"#).unwrap();
    for caps in iframe_re.captures_iter(html) {
        if let Some(m) = caps.get(1) {
            urls.push(m.as_str().to_string());
        }
    }
    // 5. Lazy-loaded media (XenForo's loadMedia(this, 'URL') onclick handlers)
    let lazy_re = Regex::new(r#"loadMedia\(this,\s*'([^']+)'"#).unwrap();
    for caps in lazy_re.captures_iter(html) {
        if let Some(m) = caps.get(1) {
            urls.push(m.as_str().to_string());
        }
    }
    // Normalize and filter; normalize_url returning None drops the URL.
    urls.into_iter()
        .filter_map(|u| normalize_url(&u, root_url))
        .collect()
}
/// Normalize a URL: resolve relative paths, upgrade protocol, skip junk.
///
/// Returns `None` for forum chrome (styles, smilies, avatars, icons,
/// reaction sprites, inline data URIs) and for anything that cannot be
/// turned into an http(s) URL. Surviving URLs get their thumbnail suffix
/// upgraded via `upgrade_thumbnail`.
fn normalize_url(url: &str, root_url: &str) -> Option<String> {
    // Fragments that identify non-content assets we never want to download.
    const JUNK_MARKERS: [&str; 6] = [
        "/styles/",
        "/smilies/",
        "data/avatars/",
        "data:image",
        "/icons/",
        "reaction-sprite",
    ];
    if JUNK_MARKERS.iter().any(|marker| url.contains(marker)) {
        return None;
    }
    // Make the URL absolute: protocol-relative gets https, root-relative is
    // joined onto the forum root, everything else passes through unchanged.
    let absolute = if let Some(rest) = url.strip_prefix("//") {
        format!("https://{}", rest)
    } else if url.starts_with('/') {
        format!("{}{}", root_url, url)
    } else {
        url.to_string()
    };
    if absolute.starts_with("http://") || absolute.starts_with("https://") {
        // Upgrade .md.jpg-style thumbnails to full size (simpcity CDN pattern).
        Some(upgrade_thumbnail(&absolute))
    } else {
        None
    }
}
/// Upgrade simpcity CDN thumbnail URLs to full-size,
/// e.g. `image.md.jpg` -> `image.jpg` (query string, if any, is preserved).
///
/// Implemented with plain string operations instead of the previous regex,
/// which was recompiled on every call (this runs once per extracted URL).
fn upgrade_thumbnail(url: &str) -> String {
    const EXTENSIONS: [&str; 5] = ["jpg", "jpeg", "png", "gif", "webp"];
    // Only inspect the path part; the query string is re-appended verbatim.
    let (path, query) = match url.find('?') {
        Some(i) => url.split_at(i),
        None => (url, ""),
    };
    for ext in EXTENSIONS {
        let marker = format!(".md.{}", ext);
        if let Some(stem) = path.strip_suffix(marker.as_str()) {
            return format!("{}.{}{}", stem, ext, query);
        }
    }
    url.to_string()
}
/// Decode common HTML entities.
///
/// `&amp;` must be decoded LAST: the previous ordering decoded it before
/// `&lt;`/`&gt;`/`&quot;`, so double-escaped input like `&amp;lt;` was
/// wrongly collapsed all the way to `<` instead of the correct `&lt;`.
fn decode_html_entities(s: &str) -> String {
    s.replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        // Keep this last so "&amp;lt;" decodes to "&lt;", not "<".
        .replace("&amp;", "&")
}
/// Extract the thread title from the page HTML.
///
/// The `(?s)` flag makes `.` match newlines, so titles whose `<h1>` body is
/// wrapped across several lines (common in pretty-printed XenForo markup)
/// are still found; the previous pattern missed them because `.` stops at
/// line breaks by default. Inner tags are stripped and HTML entities decoded.
fn extract_thread_title(html: &str) -> Option<String> {
    let re = Regex::new(r#"(?s)<h1[^>]*class="[^"]*p-title-value[^"]*"[^>]*>(.*?)</h1>"#).ok()?;
    re.captures(html)
        .and_then(|c| c.get(1))
        .map(|m| {
            // Strip inner tags like <span>
            let tag_re = Regex::new(r"<[^>]+>").unwrap();
            let title = tag_re.replace_all(m.as_str().trim(), "").trim().to_string();
            // Decode HTML entities
            decode_html_entities(&title)
        })
}
/// Find the next page URL from pagination.
///
/// Returns the (possibly root-relative) href of XenForo's
/// `pageNav-jump--next` link with `&amp;` decoded, or `None` on the last
/// page — which is what terminates the thread crawl loop.
fn find_next_page(html: &str) -> Option<String> {
    // Handle both attribute orderings: class before href, or href before class
    let re = Regex::new(
        r#"<a[^>]*href="([^"]+)"[^>]*class="[^"]*pageNav-jump--next[^"]*"|<a[^>]*class="[^"]*pageNav-jump--next[^"]*"[^>]*href="([^"]+)""#
    ).ok()?;
    re.captures(html).and_then(|c| {
        // Exactly one alternation branch matched; take whichever group is set.
        c.get(1).or(c.get(2))
    }).map(|m| {
        m.as_str().replace("&amp;", "&")
    })
}
/// Extract individual post blocks from the page HTML.
///
/// XenForo posts are `<article>` elements with `data-content="post-NNNNN"`.
/// We split the HTML at each post boundary and extract the content between them.
fn extract_posts(html: &str) -> Vec<(String, String)> {
let boundary_re = Regex::new(r#"data-content="post-(\d+)""#)
.expect("Failed to compile post boundary regex");
let matches: Vec<_> = boundary_re.captures_iter(html)
.filter_map(|c| {
let full = c.get(0)?;
let id = c.get(1)?.as_str().to_string();
Some((id, full.start()))
})
.collect();
if matches.is_empty() {
return Vec::new();
}
let mut posts = Vec::new();
for i in 0..matches.len() {
let (ref id, start) = matches[i];
let end = if i + 1 < matches.len() {
matches[i + 1].1
} else {
html.len()
};
let post_html = &html[start..end];
posts.push((id.clone(), post_html.to_string()));
}
posts
}
// ============================================================================
// XenforoThreadExtractor
// ============================================================================
/// Extracts every image/video/attachment from a XenForo thread, following
/// pagination until the last page.
pub struct XenforoThreadExtractor {
    pattern: Regex,                    // matches supported forum thread URLs
    category: String,                  // always "xenforo"
    subcategory: String,               // always "thread"
    root_url: String,                  // scheme+host(+path prefix) of the forum
    domain: Option<String>,            // domain captured from the input URL
    thread_path: Option<String>,       // e.g. "/threads/slug.12345"
    thread_id: Option<String>,         // numeric thread id, kept as a string
    page: Option<i64>,                 // explicit starting page, if the URL had one
    cookies: HashMap<String, String>,  // login cookies for authenticated fetches
}
pub struct XenforoForumExtractor {
impl XenforoThreadExtractor {
    /// Create a new thread extractor with the default root (simpcity.cr).
    ///
    /// The URL pattern captures four groups:
    /// 1. forum domain (may include a path prefix, e.g. "nudostar.com/forum")
    /// 2. thread path ("/threads/<slug>.<id>")
    /// 3. numeric thread id
    /// 4. optional starting page number ("/page-N")
    pub fn new() -> Result<Self, ExtractorError> {
        let pattern = Regex::new(
            r"(?:https?://)?(?:www\.)?(simpcity\.cr|simpcity\.su|nudostar\.com/forum|allthefallen\.moe/forum|celebforum\.to|titsintops\.com/phpBB2|forums\.socialmediagirls\.com)(/(?:index\.php\?)?threads/(?:[^/?#]+\.)?(\d+))(?:/page-(\d+))?"
        ).map_err(|e| ExtractorError::ConfigError(e.to_string()))?;
        Ok(Self {
            pattern,
            category: "xenforo".to_string(),
            subcategory: "thread".to_string(),
            // Default root; replaced in initialize() once the domain is known.
            root_url: "https://simpcity.cr".to_string(),
            domain: None,
            thread_path: None,
            thread_id: None,
            page: None,
            cookies: HashMap::new(),
        })
    }

    /// Build a fresh HTTP client with a browser-like User-Agent, a 30 s
    /// timeout and a bounded redirect policy.
    // NOTE(review): a new client is built per page fetch; reusing one client
    // would keep connection pooling — confirm and refactor if it matters.
    fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
        reqwest::Client::builder()
            .user_agent(USER_AGENT)
            .timeout(std::time::Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .map_err(|e| ExtractorError::ConfigError(e.to_string()))
    }

    /// GET a page, attaching the cookie jar when non-empty.
    ///
    /// HTTP 401/403 are mapped to a ConfigError with a hint about setting
    /// XenForo login cookies; other non-2xx statuses become HttpError.
    async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
        let client = self.create_client()?;
        let mut request = client.get(url);
        if !self.cookies.is_empty() {
            request = request.header("Cookie", cookie_header(&self.cookies));
        }
        let response = request.send().await
            .map_err(ExtractorError::RequestFailed)?;
        let status = response.status();
        if status.as_u16() == 403 || status.as_u16() == 401 {
            return Err(ExtractorError::ConfigError(format!(
                "Authentication required (HTTP {}). Set cookies in config: \
                extractor.xenforo.cookies.xf_user = \"your_cookie_value\"",
                status.as_u16()
            )));
        }
        if !status.is_success() {
            return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
        }
        response.text().await
            .map_err(|e| ExtractorError::ParseError(e.to_string()))
    }

    /// Walk the thread page-by-page, emitting one Directory message (thread
    /// id / title / category) followed by one Url message per unique media
    /// URL. De-duplication spans posts and pages via `seen_urls`.
    async fn extract_thread(&self) -> Result<Vec<Message>, ExtractorError> {
        let thread_path = self.thread_path.as_ref()
            .ok_or_else(|| ExtractorError::NotInitialized("thread_path not set".to_string()))?;
        let mut messages = Vec::new();
        let mut seen_urls: HashSet<String> = HashSet::new();
        // Build the starting URL (honoring an explicit /page-N in the input).
        let start_url = if let Some(page) = self.page {
            format!("{}{}/page-{}", self.root_url, thread_path, page)
        } else {
            format!("{}{}/", self.root_url, thread_path)
        };
        let mut current_url = Some(start_url);
        let mut page_num = self.page.unwrap_or(1);
        let mut total_media = 0;
        while let Some(url) = current_url.take() {
            log::info!("Fetching page {} of thread: {}", page_num, url);
            let html = self.fetch_page(&url).await?;
            // Extract thread title on first fetched page for the directory
            // message (the second clause fires when crawling starts on an
            // explicit page > 1).
            if page_num <= 1 || (self.page.is_some() && page_num == self.page.unwrap()) {
                let title = extract_thread_title(&html)
                    .unwrap_or_else(|| "unknown".to_string());
                log::info!("Thread title: {}", title);
                let mut dir_msg = Message::directory("");
                dir_msg.metadata.insert("thread_id".to_string(),
                    serde_json::json!(self.thread_id.as_deref().unwrap_or("0")));
                dir_msg.metadata.insert("title".to_string(), serde_json::json!(title));
                dir_msg.metadata.insert("category".to_string(), serde_json::json!("xenforo"));
                messages.push(dir_msg);
            }
            // Extract posts and their media
            let posts = extract_posts(&html);
            log::info!("Found {} posts on page {}", posts.len(), page_num);
            for (post_id, post_html) in &posts {
                let media_urls = extract_media_from_html(post_html, &self.root_url);
                for media_url in media_urls {
                    if seen_urls.contains(&media_url) {
                        continue;
                    }
                    seen_urls.insert(media_url.clone());
                    let msg = Message::url(&media_url)
                        .with_metadata("post_id", serde_json::json!(post_id))
                        .with_metadata("thread_id",
                            serde_json::json!(self.thread_id.as_deref().unwrap_or("0")));
                    // Try to extract a filename from the URL
                    if let Some(filename) = url_filename(&media_url) {
                        messages.push(msg.with_filename(filename));
                    } else {
                        messages.push(msg);
                    }
                    total_media += 1;
                }
            }
            // If no posts found at all, try a simpler fallback: just extract
            // all media from the page (handles non-standard thread markup).
            if posts.is_empty() {
                log::warn!("No post blocks found on page {} — trying full-page scan", page_num);
                let media_urls = extract_media_from_html(&html, &self.root_url);
                for media_url in media_urls {
                    if seen_urls.contains(&media_url) {
                        continue;
                    }
                    seen_urls.insert(media_url.clone());
                    let msg = Message::url(&media_url);
                    if let Some(filename) = url_filename(&media_url) {
                        messages.push(msg.with_filename(filename));
                    } else {
                        messages.push(msg);
                    }
                    total_media += 1;
                }
            }
            // Check for next page; relative hrefs are resolved against root_url.
            // No next link means last page, so the loop terminates.
            if let Some(next_href) = find_next_page(&html) {
                let next_url = if next_href.starts_with("http") {
                    next_href
                } else {
                    format!("{}{}", self.root_url, next_href)
                };
                current_url = Some(next_url);
                page_num += 1;
            }
        }
        log::info!("Extracted {} media URLs across {} pages", total_media, page_num);
        Ok(messages)
    }
}
/// Try to extract a usable filename from a URL.
///
/// Returns `None` when the URL does not parse, the path ends in `/`, or the
/// last segment carries no extension dot.
fn url_filename(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let last_segment = parsed.path().rsplit('/').next()?.to_string();
    if last_segment.is_empty() || !last_segment.contains('.') {
        return None;
    }
    // Percent-decode so e.g. "my%20file.jpg" becomes "my file.jpg".
    urlencoding::decode(&last_segment).ok().map(|cow| cow.into_owned())
}
impl Default for XenforoThreadExtractor {
    /// Delegates to `new()`; panics only if the hard-coded URL regex fails
    /// to compile, which would be a programming error.
    fn default() -> Self {
        Self::new().expect("Failed to create XenforoThreadExtractor")
    }
}
// Manual field-by-field Clone. NOTE(review): every field appears to be Clone
// itself, so `#[derive(Clone)]` on the struct would likely be equivalent —
// confirm and simplify.
impl Clone for XenforoThreadExtractor {
    fn clone(&self) -> Self {
        Self {
            pattern: self.pattern.clone(),
            category: self.category.clone(),
            subcategory: self.subcategory.clone(),
            root_url: self.root_url.clone(),
            domain: self.domain.clone(),
            thread_path: self.thread_path.clone(),
            thread_id: self.thread_id.clone(),
            page: self.page,
            cookies: self.cookies.clone(),
        }
    }
}
#[async_trait]
impl Extractor for XenforoThreadExtractor {
    fn category(&self) -> &str { &self.category }
    fn subcategory(&self) -> &str { &self.subcategory }
    fn root(&self) -> &str { &self.root_url }
    fn pattern(&self) -> &Regex { &self.pattern }
    fn clone_extractor(&self) -> Box<dyn Extractor> { Box::new(self.clone()) }

    /// Parse the matched URL into domain / thread path / thread id / page.
    ///
    /// NOTE(review): returns Ok even when the URL does not match the pattern
    /// (all fields stay None); items() then fails with NotInitialized —
    /// confirm this is the intended contract for Extractor::initialize.
    async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
        if let Some(captures) = self.pattern.captures(&m.url) {
            if let Some(domain) = captures.get(1) {
                self.domain = Some(domain.as_str().to_string());
                // Point the root at the forum the user actually requested.
                self.root_url = root_for_domain(domain.as_str());
            }
            if let Some(path) = captures.get(2) {
                self.thread_path = Some(path.as_str().to_string());
            }
            if let Some(id) = captures.get(3) {
                self.thread_id = Some(id.as_str().to_string());
            }
            if let Some(page) = captures.get(4) {
                // Unparsable page numbers fall back to page 1.
                self.page = Some(page.as_str().parse::<i64>().unwrap_or(1));
            }
        }
        log::info!("Initialized XenForo thread extractor: path={:?} id={:?} page={:?}",
            self.thread_path, self.thread_id, self.page);
        Ok(())
    }

    async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
        self.extract_thread().await
    }

    /// Install login cookies (e.g. xf_user / xf_session), replacing any
    /// previously set jar.
    fn set_cookies(&mut self, cookies: HashMap<String, String>) {
        self.cookies = cookies;
        log::debug!("XenForo cookies set: {} entries", self.cookies.len());
    }
}
// ============================================================================
// XenforoPostExtractor
// ============================================================================
/// Extracts media from a single XenForo post.
///
/// The stray `forum_id` and `client: HttpClient` fields were merge residue:
/// neither `new()` nor `clone()` initializes them and nothing reads them
/// (requests build a fresh client via `create_client()`), so they are removed.
pub struct XenforoPostExtractor {
    pattern: Regex,                    // matches .../posts/<id> URLs
    category: String,                  // always "xenforo"
    subcategory: String,               // always "post"
    root_url: String,                  // scheme+host(+path prefix) of the forum
    domain: Option<String>,            // domain captured from the input URL
    post_id: Option<String>,           // numeric post id from the input URL
    post_url_prefix: Option<String>,   // path prefix captured before the id
    cookies: HashMap<String, String>,  // login cookies for authenticated fetches
}
impl XenforoPostExtractor {
@@ -48,27 +470,79 @@ impl XenforoPostExtractor {
category: "xenforo".to_string(),
subcategory: "post".to_string(),
root_url: "https://simpcity.cr".to_string(),
domain: None,
post_id: None,
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
post_url_prefix: None,
cookies: HashMap::new(),
})
}
    /// Build a fresh HTTP client with a browser-like User-Agent, a 30 s
    /// timeout and bounded redirects.
    // NOTE(review): identical to XenforoThreadExtractor::create_client —
    // consider extracting one shared helper.
    fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
        reqwest::Client::builder()
            .user_agent(USER_AGENT)
            .timeout(std::time::Duration::from_secs(30))
            .redirect(reqwest::redirect::Policy::limited(10))
            .build()
            .map_err(|e| ExtractorError::ConfigError(e.to_string()))
    }
async fn extract_post(&self) -> Result<Vec<Message>, ExtractorError> {
let post_id = self.post_id.as_ref()
.ok_or_else(|| ExtractorError::NotInitialized("post_id not set".to_string()))?;
log::info!("Extracting XenForo post: {}", post_id);
let mut messages = Vec::new();
// Fetch the post page
let url = format!("{}/posts/{}/", self.root_url, post_id);
let client = self.create_client()?;
let mut request = client.get(&url);
if !self.cookies.is_empty() {
request = request.header("Cookie", cookie_header(&self.cookies));
}
let response = request.send().await
.map_err(ExtractorError::RequestFailed)?;
let status = response.status();
if !status.is_success() {
return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
}
let html = response.text().await
.map_err(|e| ExtractorError::ParseError(e.to_string()))?;
let mut messages = Vec::new();
let mut seen_urls: HashSet<String> = HashSet::new();
// Directory message
let mut dir_msg = Message::directory("");
dir_msg.metadata.insert("post_id".to_string(), serde_json::json!(post_id.parse::<i64>().unwrap_or(0)));
dir_msg.metadata.insert("post_id".to_string(), serde_json::json!(post_id));
messages.push(dir_msg);
log::info!("Found XenForo post {}", post_id);
// Try to find just the target post
let posts = extract_posts(&html);
let target_html = posts.iter()
.find(|(id, _)| id == post_id)
.map(|(_, content)| content.as_str())
.unwrap_or(&html);
let media_urls = extract_media_from_html(target_html, &self.root_url);
for media_url in media_urls {
if seen_urls.contains(&media_url) {
continue;
}
seen_urls.insert(media_url.clone());
let msg = Message::url(&media_url)
.with_metadata("post_id", serde_json::json!(post_id));
if let Some(filename) = url_filename(&media_url) {
messages.push(msg.with_filename(filename));
} else {
messages.push(msg);
}
}
log::info!("Extracted {} media URLs from post {}", messages.len() - 1, post_id);
Ok(messages)
}
}
@@ -86,11 +560,10 @@ impl Clone for XenforoPostExtractor {
category: self.category.clone(),
subcategory: self.subcategory.clone(),
root_url: self.root_url.clone(),
domain: self.domain.clone(),
post_id: self.post_id.clone(),
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.expect("Failed to create HTTP client"),
post_url_prefix: self.post_url_prefix.clone(),
cookies: self.cookies.clone(),
}
}
}
@@ -105,7 +578,14 @@ impl Extractor for XenforoPostExtractor {
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
if let Some(captures) = self.pattern.captures(&m.url) {
if let Some(id) = captures.get(2) {
if let Some(domain) = captures.get(1) {
self.domain = Some(domain.as_str().to_string());
self.root_url = root_for_domain(domain.as_str());
}
if let Some(prefix) = captures.get(2) {
self.post_url_prefix = Some(prefix.as_str().to_string());
}
if let Some(id) = captures.get(3) {
self.post_id = Some(id.as_str().to_string());
}
}
@@ -115,91 +595,24 @@ impl Extractor for XenforoPostExtractor {
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
self.extract_post().await
}
}
impl XenforoThreadExtractor {
pub fn new() -> Result<Self, ExtractorError> {
let pattern = Regex::new(
r"(?:https?://)?(?:www\.)?(simpcity\.cr|simpcity\.su|nudostar\.com/forum|allthefallen\.moe/forum|celebforum\.to|titsintops\.com/phpBB2|forums\.socialmediagirls\.com)(/(?:index\.php\?)?threads/(?:[^/?#]+\.)?(\d+))(?:/page-(\d+))?"
).map_err(|e| ExtractorError::ConfigError(e.to_string()))?;
Ok(Self {
pattern,
category: "xenforo".to_string(),
subcategory: "thread".to_string(),
root_url: "https://simpcity.cr".to_string(),
thread_id: None,
page: None,
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
})
}
async fn extract_thread(&self) -> Result<Vec<Message>, ExtractorError> {
let thread_id = self.thread_id.as_ref()
.ok_or_else(|| ExtractorError::NotInitialized("thread_id not set".to_string()))?;
log::info!("Extracting XenForo thread: {}", thread_id);
let mut messages = Vec::new();
let mut dir_msg = Message::directory("");
dir_msg.metadata.insert("thread_id".to_string(), serde_json::json!(thread_id.parse::<i64>().unwrap_or(0)));
messages.push(dir_msg);
log::info!("Found XenForo thread {}", thread_id);
Ok(messages)
fn set_cookies(&mut self, cookies: HashMap<String, String>) {
self.cookies = cookies;
}
}
impl Default for XenforoThreadExtractor {
fn default() -> Self {
Self::new().expect("Failed to create XenforoThreadExtractor")
}
}
// ============================================================================
// XenforoForumExtractor
// ============================================================================
impl Clone for XenforoThreadExtractor {
fn clone(&self) -> Self {
Self {
pattern: self.pattern.clone(),
category: self.category.clone(),
subcategory: self.subcategory.clone(),
root_url: self.root_url.clone(),
thread_id: self.thread_id.clone(),
page: self.page.clone(),
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.expect("Failed to create HTTP client"),
}
}
}
#[async_trait]
impl Extractor for XenforoThreadExtractor {
fn category(&self) -> &str { &self.category }
fn subcategory(&self) -> &str { &self.subcategory }
fn root(&self) -> &str { &self.root_url }
fn pattern(&self) -> &Regex { &self.pattern }
fn clone_extractor(&self) -> Box<dyn Extractor> { Box::new(self.clone()) }
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
if let Some(captures) = self.pattern.captures(&m.url) {
if let Some(id) = captures.get(2) {
self.thread_id = Some(id.as_str().to_string());
}
if let Some(page) = captures.get(3) {
self.page = Some(page.as_str().parse::<i64>().unwrap_or(1));
}
}
Ok(())
}
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
self.extract_thread().await
}
/// Placeholder extractor for XenForo forum (thread-listing) pages; crawling
/// is not implemented yet, so its items() currently yields nothing.
pub struct XenforoForumExtractor {
    pattern: Regex,                    // matches forum-index URLs
    category: String,                  // always "xenforo"
    subcategory: String,               // always "forum"
    root_url: String,                  // scheme+host(+path prefix) of the forum
    domain: Option<String>,            // domain captured from the input URL
    forum_path: Option<String>,        // captured forum path
    cookies: HashMap<String, String>,  // login cookies
}
impl XenforoForumExtractor {
@@ -213,11 +626,9 @@ impl XenforoForumExtractor {
category: "xenforo".to_string(),
subcategory: "forum".to_string(),
root_url: "https://simpcity.cr".to_string(),
forum_id: None,
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
domain: None,
forum_path: None,
cookies: HashMap::new(),
})
}
}
@@ -235,11 +646,9 @@ impl Clone for XenforoForumExtractor {
category: self.category.clone(),
subcategory: self.subcategory.clone(),
root_url: self.root_url.clone(),
forum_id: self.forum_id.clone(),
client: HttpClient::builder()
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.build()
.expect("Failed to create HTTP client"),
domain: self.domain.clone(),
forum_path: self.forum_path.clone(),
cookies: self.cookies.clone(),
}
}
}
@@ -254,17 +663,25 @@ impl Extractor for XenforoForumExtractor {
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
if let Some(captures) = self.pattern.captures(&m.url) {
if let Some(id) = captures.get(1) {
self.forum_id = Some(id.as_str().to_string());
if let Some(domain) = captures.get(1) {
self.domain = Some(domain.as_str().to_string());
self.root_url = root_for_domain(domain.as_str());
}
if let Some(path) = captures.get(2) {
self.forum_path = Some(path.as_str().to_string());
}
}
Ok(())
}
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
log::info!("Extracting XenForo forum");
log::info!("XenForo forum extractor not yet implemented");
Ok(vec![])
}
    /// Install login cookies, replacing (not merging) any previous jar.
    fn set_cookies(&mut self, cookies: HashMap<String, String>) {
        self.cookies = cookies;
    }
}
#[cfg(test)]
@@ -282,4 +699,116 @@ mod tests {
let extractor = XenforoThreadExtractor::new().unwrap();
assert!(extractor.pattern.is_match("https://simpcity.cr/threads/TITLE.12345/"));
}
#[test]
fn test_thread_pattern_with_page() {
let extractor = XenforoThreadExtractor::new().unwrap();
let url = "https://simpcity.cr/threads/dimeestevez.39618/page-2";
assert!(extractor.pattern.is_match(url));
let caps = extractor.pattern.captures(url).unwrap();
assert_eq!(caps.get(1).unwrap().as_str(), "simpcity.cr");
assert_eq!(caps.get(2).unwrap().as_str(), "/threads/dimeestevez.39618");
assert_eq!(caps.get(3).unwrap().as_str(), "39618");
assert_eq!(caps.get(4).unwrap().as_str(), "2");
}
#[test]
fn test_extract_media_from_html() {
let html = r#"
<img src="https://example.com/image1.jpg" class="bbImage " loading="lazy" />
<video src="https://example.com/video.mp4"></video>
<a href="https://example.com/attachments/file.zip">Download</a>
"#;
let urls = extract_media_from_html(html, "https://simpcity.cr");
assert_eq!(urls.len(), 3);
assert!(urls.contains(&"https://example.com/image1.jpg".to_string()));
assert!(urls.contains(&"https://example.com/video.mp4".to_string()));
assert!(urls.contains(&"https://example.com/attachments/file.zip".to_string()));
}
#[test]
fn test_extract_media_skips_smilies() {
let html = r#"
<img src="https://simpcity.cr/styles/emoji.png" class="bbImage" />
<img src="https://example.com/real-image.jpg" class="bbImage " loading="lazy" />
"#;
let urls = extract_media_from_html(html, "https://simpcity.cr");
assert_eq!(urls.len(), 1);
assert_eq!(urls[0], "https://example.com/real-image.jpg");
}
#[test]
fn test_upgrade_thumbnail() {
assert_eq!(
upgrade_thumbnail("https://simp1.selti-delivery.ru/images/test.md.jpg"),
"https://simp1.selti-delivery.ru/images/test.jpg"
);
assert_eq!(
upgrade_thumbnail("https://example.com/image.jpg"),
"https://example.com/image.jpg"
);
}
#[test]
fn test_extract_posts_from_real_html() {
let html = r#"
<article class="message" data-content="post-111" id="js-post-111">
<article class="message-body js-selectToQuote">
<img src="https://cdn.example.com/img1.jpg" class="bbImage " />
</article>
</article>
<article class="message" data-content="post-222" id="js-post-222">
<article class="message-body js-selectToQuote">
<img src="https://cdn.example.com/img2.jpg" class="bbImage " />
</article>
</article>
"#;
let posts = extract_posts(html);
assert_eq!(posts.len(), 2);
assert_eq!(posts[0].0, "111");
assert_eq!(posts[1].0, "222");
// Each post should yield its own image
let urls1 = extract_media_from_html(&posts[0].1, "https://simpcity.cr");
assert_eq!(urls1.len(), 1);
assert!(urls1[0].contains("img1.jpg"));
let urls2 = extract_media_from_html(&posts[1].1, "https://simpcity.cr");
assert_eq!(urls2.len(), 1);
assert!(urls2[0].contains("img2.jpg"));
}
#[test]
fn test_find_next_page() {
let html = r#"<a href="/threads/test.123/page-2" class="pageNav-jump pageNav-jump--next">Next</a>"#;
assert_eq!(find_next_page(html), Some("/threads/test.123/page-2".to_string()));
}
#[test]
fn test_find_next_page_none() {
let html = r#"<div>no pagination here</div>"#;
assert_eq!(find_next_page(html), None);
}
#[test]
fn test_extract_thread_title() {
let html = r#"<h1 class="p-title-value">Thread Title Here</h1>"#;
assert_eq!(extract_thread_title(html), Some("Thread Title Here".to_string()));
}
#[test]
fn test_url_filename() {
assert_eq!(
url_filename("https://example.com/path/to/image.jpg"),
Some("image.jpg".to_string())
);
assert_eq!(url_filename("https://example.com/"), None);
}
#[test]
fn test_root_for_domain() {
assert_eq!(root_for_domain("simpcity.cr"), "https://simpcity.cr");
assert_eq!(root_for_domain("nudostar.com/forum"), "https://nudostar.com/forum");
}
}

View File

@@ -86,6 +86,17 @@ fn write_page_dump(url: &str, items: &[Message]) {
let _ = std::fs::write(path, out);
}
/// Extract a usable filename from a URL path.
///
/// Returns `None` when the URL does not parse, the path ends in `/`, or the
/// last segment has no extension dot; otherwise the percent-decoded segment.
// NOTE(review): duplicates `url_filename` in the XenForo extractor —
// consider sharing one helper.
fn url_to_filename(url: &str) -> Option<String> {
    let parsed = url::Url::parse(url).ok()?;
    let path = parsed.path();
    let segment = path.rsplit('/').next()?;
    if segment.is_empty() || !segment.contains('.') {
        return None;
    }
    urlencoding::decode(segment).ok().map(|s| s.into_owned())
}
fn render_filename(pattern: Option<&str>, index: usize, item: &Message) -> String {
if let Some(template) = pattern {
let ext = item.extension().unwrap_or_else(|| "bin".to_string());
@@ -1092,9 +1103,14 @@ fn main() {
}
}
} else if let Some(ref browser) = args.cookies_from_browser {
match gallery_dl::extract_browser_cookies(browser, None) {
// Extract the domain from input URLs to filter browser cookies
let domain_filter: Option<String> = args.urls.first()
.and_then(|u| url::Url::parse(u).ok())
.and_then(|u| u.host_str().map(|h| h.to_string()));
match gallery_dl::extract_browser_cookies(browser, domain_filter.as_deref()) {
Ok(c) => {
log::info!("Extracted {} cookies from browser '{}'", c.len(), browser);
log::info!("Extracted {} cookies from browser '{}' (domain filter: {:?})", c.len(), browser, domain_filter);
Some(c)
}
Err(e) => {
@@ -1644,36 +1660,66 @@ fn main() {
let mut metadata_by_url: HashMap<String, HashMap<String, serde_json::Value>> =
HashMap::new();
// Determine download directory: CLI arg > config > default
let download_dir = args.directory.clone()
// Determine base download directory: CLI arg > config > default (Pictures/gallery-dl)
let base_dir = args.directory.clone()
.or_else(|| args.destination.clone())
.or_else(|| config.downloader.directory.clone())
.unwrap_or_else(|| PathBuf::from("."));
.unwrap_or_else(|| {
dirs::picture_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join("gallery-dl")
});
// Extract directory metadata from the first Directory message
// to build subdirectory path: {category}/{title}/
let mut dir_category = String::new();
let mut dir_title = String::new();
for item in items.iter() {
if matches!(item.kind, MessageKind::Directory) {
if let Some(cat) = item.metadata.get("category") {
dir_category = cat.as_str().unwrap_or("").to_string();
}
if let Some(title) = item.metadata.get("title") {
dir_title = title.as_str().unwrap_or("").to_string();
}
break;
}
}
// Build the download directory with subdirectories
let download_dir = if !dir_category.is_empty() || !dir_title.is_empty() {
let cat = if dir_category.is_empty() { "other".to_string() } else {
sanitize_filename(&dir_category, args.restrict_filenames, true)
};
let title = if dir_title.is_empty() { "untitled".to_string() } else {
sanitize_filename(&dir_title, args.restrict_filenames, true)
};
base_dir.join(cat).join(title)
} else {
base_dir.clone()
};
for (j, item) in items.iter().enumerate() {
if !matches!(item.kind, MessageKind::Url | MessageKind::Queue) {
println!(" [{}] Skipping non-download message ({:?})", j + 1, item.kind);
continue;
}
let mut template_pattern = args
.rename_to
.as_deref()
.or(args.rename.as_deref())
.or(args.filename.as_deref())
.or(config.downloader.filename.as_deref());
if template_pattern.is_none() {
template_pattern = Some("{num}.{ext}");
}
// Create a simple destination path based on the URL
// In a full implementation, this would use path templates
let filename = render_filename(
template_pattern,
j,
item,
);
// Use the extractor-provided filename, or derive from URL, or fall back to template
let filename = if let Some(ref f) = item.filename {
f.clone()
} else if let Some(f) = url_to_filename(&item.url) {
f
} else {
let template_pattern = args
.rename_to
.as_deref()
.or(args.rename.as_deref())
.or(args.filename.as_deref())
.or(config.downloader.filename.as_deref())
.unwrap_or("{num}.{ext}");
render_filename(Some(template_pattern), j, item)
};
let filename = sanitize_filename(
&filename,
args.restrict_filenames,