feat: add Snapchat extractor, improve browser auth and XenForo support
- Add new Snapchat story extractor with spotlight and user story support - Expand browser cookie extraction to support Zen Browser and multi-platform profiles - Significantly enhance XenForo extractor with gallery, media, and attachment support - Add APPDATA-based profile discovery for Windows browsers - Update main.rs with new extractor wiring and improved CLI handling Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,4 +1,5 @@
|
|||||||
archive/
|
archive/
|
||||||
|
.claude/
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|||||||
@@ -1,8 +1,15 @@
|
|||||||
//! Browser cookie extraction for Firefox and Chrome
|
//! Browser cookie extraction for Firefox-based browsers and Chrome
|
||||||
//!
|
//!
|
||||||
//! This module provides functionality to extract cookies directly from
|
//! This module provides functionality to extract cookies directly from
|
||||||
//! browser SQLite cookie databases, enabling seamless authentication
|
//! browser SQLite cookie databases, enabling seamless authentication
|
||||||
//! without manual cookie file exports.
|
//! without manual cookie file exports.
|
||||||
|
//!
|
||||||
|
//! Supported browsers:
|
||||||
|
//! - Firefox (all platforms)
|
||||||
|
//! - Zen Browser (Firefox-based)
|
||||||
|
//! - LibreWolf (Firefox-based)
|
||||||
|
//! - Waterfox (Firefox-based)
|
||||||
|
//! - Chrome / Chromium (all platforms, plaintext cookies only)
|
||||||
|
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
@@ -60,6 +67,17 @@ fn get_home_dir() -> Result<PathBuf, BrowserError> {
|
|||||||
.ok_or_else(|| BrowserError::Other("Could not determine home directory".to_string()))
|
.ok_or_else(|| BrowserError::Other("Could not determine home directory".to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the APPDATA directory (Windows only, falls back to home)
|
||||||
|
fn get_appdata_dir() -> Result<PathBuf, BrowserError> {
|
||||||
|
// Try APPDATA env var first (Windows)
|
||||||
|
if let Ok(appdata) = std::env::var("APPDATA") {
|
||||||
|
return Ok(PathBuf::from(appdata));
|
||||||
|
}
|
||||||
|
// Fallback: use dirs crate
|
||||||
|
dirs::config_dir()
|
||||||
|
.ok_or_else(|| BrowserError::Other("Could not determine config directory".to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
/// Copy a file to a temporary location to avoid locking issues
|
/// Copy a file to a temporary location to avoid locking issues
|
||||||
fn copy_to_temp<P: AsRef<std::path::Path>>(path: P) -> Result<tempfile::TempPath, BrowserError> {
|
fn copy_to_temp<P: AsRef<std::path::Path>>(path: P) -> Result<tempfile::TempPath, BrowserError> {
|
||||||
let temp_file = tempfile::NamedTempFile::new()?;
|
let temp_file = tempfile::NamedTempFile::new()?;
|
||||||
@@ -67,32 +85,26 @@ fn copy_to_temp<P: AsRef<std::path::Path>>(path: P) -> Result<tempfile::TempPath
|
|||||||
Ok(temp_file.into_temp_path())
|
Ok(temp_file.into_temp_path())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the Firefox profile directory
|
/// Find a profile directory from a list of candidate parent directories.
|
||||||
///
|
///
|
||||||
/// Searches in ~/.mozilla/firefox/ for profiles
|
/// Searches each candidate for subdirectories containing `cookies.sqlite`.
|
||||||
pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
|
/// Prefers `default-release` profiles, then `default` profiles.
|
||||||
let home = get_home_dir()?;
|
fn find_profile_in_dirs(candidate_dirs: &[PathBuf]) -> Result<PathBuf, BrowserError> {
|
||||||
let firefox_dir = home.join(".mozilla").join("firefox");
|
for dir in candidate_dirs {
|
||||||
|
if !dir.exists() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if !firefox_dir.exists() {
|
let entries = match fs::read_dir(dir) {
|
||||||
return Err(BrowserError::ProfileNotFound(format!(
|
Ok(e) => e,
|
||||||
"Firefox directory not found: {:?}",
|
Err(_) => continue,
|
||||||
firefox_dir
|
};
|
||||||
)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read directory entries
|
let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new();
|
||||||
let entries = fs::read_dir(&firefox_dir).map_err(|e| BrowserError::Io(e))?;
|
|
||||||
|
|
||||||
let mut profile_dirs: Vec<(String, PathBuf)> = Vec::new();
|
for entry in entries.flatten() {
|
||||||
|
let path = entry.path();
|
||||||
for entry in entries.flatten() {
|
if path.is_dir() && path.join("cookies.sqlite").exists() {
|
||||||
let path = entry.path();
|
|
||||||
if path.is_dir() {
|
|
||||||
// Check if this is a profile directory (contains cookies.sqlite)
|
|
||||||
let cookies_path = path.join("cookies.sqlite");
|
|
||||||
if cookies_path.exists() {
|
|
||||||
// Get the profile name from the directory name
|
|
||||||
let name = path
|
let name = path
|
||||||
.file_name()
|
.file_name()
|
||||||
.and_then(|n| n.to_str())
|
.and_then(|n| n.to_str())
|
||||||
@@ -101,46 +113,112 @@ pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
|
|||||||
profile_dirs.push((name, path));
|
profile_dirs.push((name, path));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if profile_dirs.is_empty() {
|
if profile_dirs.is_empty() {
|
||||||
return Err(BrowserError::ProfileNotFound(
|
continue;
|
||||||
"No Firefox profiles with cookies found".to_string(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prefer default-release profile, otherwise use first available
|
|
||||||
profile_dirs.sort_by(|a, b| {
|
|
||||||
let a_default = a.0.contains("default-release");
|
|
||||||
let b_default = b.0.contains("default-release");
|
|
||||||
match (a_default, b_default) {
|
|
||||||
(true, false) => std::cmp::Ordering::Less,
|
|
||||||
(false, true) => std::cmp::Ordering::Greater,
|
|
||||||
_ => std::cmp::Ordering::Equal,
|
|
||||||
}
|
}
|
||||||
});
|
|
||||||
|
|
||||||
let selected = &profile_dirs[0].1;
|
// Sort: prefer default-release > default > anything else
|
||||||
log::info!("Found Firefox profile: {:?}", selected);
|
profile_dirs.sort_by(|a, b| {
|
||||||
Ok(selected.clone())
|
fn rank(name: &str) -> u8 {
|
||||||
|
if name.contains("default-release") { 0 }
|
||||||
|
else if name.contains("default") { 1 }
|
||||||
|
else { 2 }
|
||||||
|
}
|
||||||
|
rank(&a.0).cmp(&rank(&b.0))
|
||||||
|
});
|
||||||
|
|
||||||
|
let selected = &profile_dirs[0].1;
|
||||||
|
log::info!("Found browser profile: {:?}", selected);
|
||||||
|
return Ok(selected.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(BrowserError::ProfileNotFound(format!(
|
||||||
|
"No profiles with cookies found. Searched: {:?}",
|
||||||
|
candidate_dirs
|
||||||
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract cookies from Firefox profile
|
/// Get candidate profile directories for Firefox
|
||||||
///
|
fn firefox_profile_dirs() -> Vec<PathBuf> {
|
||||||
/// # Arguments
|
let mut dirs = Vec::new();
|
||||||
/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com")
|
if let Ok(home) = get_home_dir() {
|
||||||
///
|
// Windows
|
||||||
/// Returns a HashMap of cookie name -> value
|
if let Ok(appdata) = get_appdata_dir() {
|
||||||
pub fn extract_firefox_cookies(
|
dirs.push(appdata.join("Mozilla").join("Firefox").join("Profiles"));
|
||||||
|
}
|
||||||
|
// Linux
|
||||||
|
dirs.push(home.join(".mozilla").join("firefox"));
|
||||||
|
// macOS
|
||||||
|
dirs.push(home.join("Library").join("Application Support").join("Firefox").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get candidate profile directories for Zen Browser
|
||||||
|
fn zen_profile_dirs() -> Vec<PathBuf> {
|
||||||
|
let mut dirs = Vec::new();
|
||||||
|
if let Ok(home) = get_home_dir() {
|
||||||
|
// Windows
|
||||||
|
if let Ok(appdata) = get_appdata_dir() {
|
||||||
|
dirs.push(appdata.join("zen").join("Profiles"));
|
||||||
|
}
|
||||||
|
// Linux
|
||||||
|
dirs.push(home.join(".zen"));
|
||||||
|
// macOS
|
||||||
|
dirs.push(home.join("Library").join("Application Support").join("zen").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get candidate profile directories for LibreWolf
|
||||||
|
fn librewolf_profile_dirs() -> Vec<PathBuf> {
|
||||||
|
let mut dirs = Vec::new();
|
||||||
|
if let Ok(home) = get_home_dir() {
|
||||||
|
if let Ok(appdata) = get_appdata_dir() {
|
||||||
|
dirs.push(appdata.join("librewolf").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs.push(home.join(".librewolf"));
|
||||||
|
dirs.push(home.join("Library").join("Application Support").join("librewolf").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get candidate profile directories for Waterfox
|
||||||
|
fn waterfox_profile_dirs() -> Vec<PathBuf> {
|
||||||
|
let mut dirs = Vec::new();
|
||||||
|
if let Ok(home) = get_home_dir() {
|
||||||
|
if let Ok(appdata) = get_appdata_dir() {
|
||||||
|
dirs.push(appdata.join("Waterfox").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs.push(home.join(".waterfox"));
|
||||||
|
dirs.push(home.join("Library").join("Application Support").join("Waterfox").join("Profiles"));
|
||||||
|
}
|
||||||
|
dirs
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find a Firefox profile directory (searches standard Firefox locations)
|
||||||
|
pub fn find_firefox_profile() -> Result<PathBuf, BrowserError> {
|
||||||
|
find_profile_in_dirs(&firefox_profile_dirs())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find a Zen Browser profile directory
|
||||||
|
pub fn find_zen_profile() -> Result<PathBuf, BrowserError> {
|
||||||
|
find_profile_in_dirs(&zen_profile_dirs())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract cookies from a Firefox-compatible SQLite database (moz_cookies table)
|
||||||
|
fn extract_moz_cookies(
|
||||||
|
profile_dir: &PathBuf,
|
||||||
domain: Option<&str>,
|
domain: Option<&str>,
|
||||||
|
browser_name: &str,
|
||||||
) -> Result<HashMap<String, String>, BrowserError> {
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
let profile_dir = find_firefox_profile()?;
|
|
||||||
let cookies_path = profile_dir.join("cookies.sqlite");
|
let cookies_path = profile_dir.join("cookies.sqlite");
|
||||||
|
|
||||||
if !cookies_path.exists() {
|
if !cookies_path.exists() {
|
||||||
return Err(BrowserError::DatabaseNotFound(format!(
|
return Err(BrowserError::DatabaseNotFound(format!(
|
||||||
"Firefox cookies database not found: {:?}",
|
"{} cookies database not found: {:?}",
|
||||||
cookies_path
|
browser_name, cookies_path
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -150,7 +228,6 @@ pub fn extract_firefox_cookies(
|
|||||||
|
|
||||||
let cookies: HashMap<String, String> = match domain {
|
let cookies: HashMap<String, String> = match domain {
|
||||||
Some(d) => {
|
Some(d) => {
|
||||||
// Query with domain filter
|
|
||||||
let pattern = format!("%{}", d);
|
let pattern = format!("%{}", d);
|
||||||
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies WHERE host LIKE ?")?;
|
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies WHERE host LIKE ?")?;
|
||||||
let mut cookies = HashMap::new();
|
let mut cookies = HashMap::new();
|
||||||
@@ -163,7 +240,6 @@ pub fn extract_firefox_cookies(
|
|||||||
cookies
|
cookies
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
// Get all cookies
|
|
||||||
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies")?;
|
let mut stmt = conn.prepare("SELECT name, value FROM moz_cookies")?;
|
||||||
let mut cookies = HashMap::new();
|
let mut cookies = HashMap::new();
|
||||||
let rows = stmt.query_map([], |row| {
|
let rows = stmt.query_map([], |row| {
|
||||||
@@ -176,24 +252,61 @@ pub fn extract_firefox_cookies(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
log::info!("Extracted {} cookies from Firefox", cookies.len());
|
log::info!("Extracted {} cookies from {}", cookies.len(), browser_name);
|
||||||
Ok(cookies)
|
Ok(cookies)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract cookies from Firefox
|
||||||
|
pub fn extract_firefox_cookies(
|
||||||
|
domain: Option<&str>,
|
||||||
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
|
let profile_dir = find_profile_in_dirs(&firefox_profile_dirs())?;
|
||||||
|
extract_moz_cookies(&profile_dir, domain, "Firefox")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract cookies from Zen Browser
|
||||||
|
pub fn extract_zen_cookies(
|
||||||
|
domain: Option<&str>,
|
||||||
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
|
let profile_dir = find_profile_in_dirs(&zen_profile_dirs())?;
|
||||||
|
extract_moz_cookies(&profile_dir, domain, "Zen Browser")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract cookies from LibreWolf
|
||||||
|
pub fn extract_librewolf_cookies(
|
||||||
|
domain: Option<&str>,
|
||||||
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
|
let profile_dir = find_profile_in_dirs(&librewolf_profile_dirs())?;
|
||||||
|
extract_moz_cookies(&profile_dir, domain, "LibreWolf")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract cookies from Waterfox
|
||||||
|
pub fn extract_waterfox_cookies(
|
||||||
|
domain: Option<&str>,
|
||||||
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
|
let profile_dir = find_profile_in_dirs(&waterfox_profile_dirs())?;
|
||||||
|
extract_moz_cookies(&profile_dir, domain, "Waterfox")
|
||||||
|
}
|
||||||
|
|
||||||
/// Find the Chrome profile directory
|
/// Find the Chrome profile directory
|
||||||
///
|
|
||||||
/// Searches in ~/.config/google-chrome/ for Default profile
|
|
||||||
pub fn find_chrome_profile() -> Result<PathBuf, BrowserError> {
|
pub fn find_chrome_profile() -> Result<PathBuf, BrowserError> {
|
||||||
let home = get_home_dir()?;
|
let home = get_home_dir()?;
|
||||||
|
|
||||||
// Try different possible Chrome config locations
|
let mut possible_paths = Vec::new();
|
||||||
let possible_paths = vec![
|
|
||||||
home.join(".config").join("google-chrome"),
|
// Windows
|
||||||
home.join(".config").join("chromium"),
|
if let Ok(local_appdata) = std::env::var("LOCALAPPDATA") {
|
||||||
home.join("Library")
|
let local = PathBuf::from(local_appdata);
|
||||||
.join("Application Support")
|
possible_paths.push(local.join("Google").join("Chrome").join("User Data"));
|
||||||
.join("Google Chrome"),
|
possible_paths.push(local.join("Chromium").join("User Data"));
|
||||||
];
|
}
|
||||||
|
|
||||||
|
// Linux
|
||||||
|
possible_paths.push(home.join(".config").join("google-chrome"));
|
||||||
|
possible_paths.push(home.join(".config").join("chromium"));
|
||||||
|
|
||||||
|
// macOS
|
||||||
|
possible_paths.push(home.join("Library").join("Application Support").join("Google Chrome"));
|
||||||
|
|
||||||
for chrome_dir in possible_paths {
|
for chrome_dir in possible_paths {
|
||||||
if chrome_dir.exists() {
|
if chrome_dir.exists() {
|
||||||
@@ -215,13 +328,8 @@ pub fn find_chrome_profile() -> Result<PathBuf, BrowserError> {
|
|||||||
|
|
||||||
/// Extract cookies from Chrome profile
|
/// Extract cookies from Chrome profile
|
||||||
///
|
///
|
||||||
/// Note: Chrome stores some cookies with encrypted values using the OS keyring.
|
/// Note: Chrome encrypts most cookies using the OS keyring.
|
||||||
/// This function extracts plaintext cookies and logs a warning for encrypted ones.
|
/// This function extracts plaintext cookies and skips encrypted ones.
|
||||||
///
|
|
||||||
/// # Arguments
|
|
||||||
/// * `domain` - Optional domain to filter cookies (e.g., ".twitter.com")
|
|
||||||
///
|
|
||||||
/// Returns a HashMap of cookie name -> value
|
|
||||||
pub fn extract_chrome_cookies(
|
pub fn extract_chrome_cookies(
|
||||||
domain: Option<&str>,
|
domain: Option<&str>,
|
||||||
) -> Result<HashMap<String, String>, BrowserError> {
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
@@ -235,14 +343,12 @@ pub fn extract_chrome_cookies(
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy to temp to avoid locking
|
|
||||||
let temp_path = copy_to_temp(&cookies_path)?;
|
let temp_path = copy_to_temp(&cookies_path)?;
|
||||||
let conn = Connection::open(&temp_path)?;
|
let conn = Connection::open(&temp_path)?;
|
||||||
|
|
||||||
let mut cookies = HashMap::new();
|
let mut cookies = HashMap::new();
|
||||||
let mut encrypted_count = 0;
|
let mut encrypted_count = 0;
|
||||||
|
|
||||||
// Chrome uses different table schema - check for encrypted_value column
|
|
||||||
let has_encrypted = conn
|
let has_encrypted = conn
|
||||||
.query_row(
|
.query_row(
|
||||||
"SELECT COUNT(*) FROM pragma_table_info('cookies') WHERE name='encrypted_value'",
|
"SELECT COUNT(*) FROM pragma_table_info('cookies') WHERE name='encrypted_value'",
|
||||||
@@ -252,14 +358,13 @@ pub fn extract_chrome_cookies(
|
|||||||
.unwrap_or(0)
|
.unwrap_or(0)
|
||||||
> 0;
|
> 0;
|
||||||
|
|
||||||
// Always select with domain filter (use wildcard for all)
|
|
||||||
let domain_pattern = match domain {
|
let domain_pattern = match domain {
|
||||||
Some(d) => format!("%{}%", d),
|
Some(d) => format!("%{}%", d),
|
||||||
None => "%".to_string(),
|
None => "%".to_string(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut stmt =
|
let mut stmt =
|
||||||
conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host LIKE ?")?;
|
conn.prepare("SELECT name, value, encrypted_value FROM cookies WHERE host_key LIKE ?")?;
|
||||||
|
|
||||||
let rows = stmt.query_map([domain_pattern], |row| {
|
let rows = stmt.query_map([domain_pattern], |row| {
|
||||||
let name: String = row.get(0)?;
|
let name: String = row.get(0)?;
|
||||||
@@ -271,12 +376,11 @@ pub fn extract_chrome_cookies(
|
|||||||
for row_result in rows {
|
for row_result in rows {
|
||||||
let (name, value, encrypted) = row_result?;
|
let (name, value, encrypted) = row_result?;
|
||||||
|
|
||||||
// Check if cookie has encrypted value
|
|
||||||
if has_encrypted {
|
if has_encrypted {
|
||||||
if let Some(enc) = encrypted {
|
if let Some(enc) = encrypted {
|
||||||
if !enc.is_empty() {
|
if !enc.is_empty() {
|
||||||
encrypted_count += 1;
|
encrypted_count += 1;
|
||||||
continue; // Skip encrypted cookies
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -287,7 +391,7 @@ pub fn extract_chrome_cookies(
|
|||||||
if encrypted_count > 0 {
|
if encrypted_count > 0 {
|
||||||
log::warn!(
|
log::warn!(
|
||||||
"Skipped {} encrypted Chrome cookies (OS keyring required). \
|
"Skipped {} encrypted Chrome cookies (OS keyring required). \
|
||||||
Run with --cookies-file for encrypted cookies.",
|
Use --cookies with a cookies.txt file instead.",
|
||||||
encrypted_count
|
encrypted_count
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -300,32 +404,22 @@ pub fn extract_chrome_cookies(
|
|||||||
Ok(cookies)
|
Ok(cookies)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract cookies from a browser
|
/// Extract cookies from a browser by name
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// Supported browsers: firefox, zen, librewolf, waterfox, chrome, chromium
|
||||||
/// * `browser` - Browser name: "firefox", "chrome", or "chromium"
|
|
||||||
/// * `domain` - Optional domain to filter cookies
|
|
||||||
///
|
|
||||||
/// # Example
|
|
||||||
/// ```no_run
|
|
||||||
/// use gallery_dl::auth::extract_browser_cookies;
|
|
||||||
///
|
|
||||||
/// // Get all cookies from Firefox
|
|
||||||
/// let cookies = extract_browser_cookies("firefox", None).unwrap();
|
|
||||||
///
|
|
||||||
/// // Get Twitter cookies from Chrome
|
|
||||||
/// let twitter_cookies = extract_browser_cookies("chrome", Some("twitter.com")).unwrap();
|
|
||||||
/// ```
|
|
||||||
pub fn extract_browser_cookies(
|
pub fn extract_browser_cookies(
|
||||||
browser: &str,
|
browser: &str,
|
||||||
domain: Option<&str>,
|
domain: Option<&str>,
|
||||||
) -> Result<HashMap<String, String>, BrowserError> {
|
) -> Result<HashMap<String, String>, BrowserError> {
|
||||||
match browser.to_lowercase().as_str() {
|
match browser.to_lowercase().as_str() {
|
||||||
"firefox" | "ff" => extract_firefox_cookies(domain),
|
"firefox" | "ff" => extract_firefox_cookies(domain),
|
||||||
|
"zen" | "zen-browser" => extract_zen_cookies(domain),
|
||||||
|
"librewolf" => extract_librewolf_cookies(domain),
|
||||||
|
"waterfox" => extract_waterfox_cookies(domain),
|
||||||
"chrome" | "google-chrome" => extract_chrome_cookies(domain),
|
"chrome" | "google-chrome" => extract_chrome_cookies(domain),
|
||||||
"chromium" => extract_chrome_cookies(domain),
|
"chromium" => extract_chrome_cookies(domain),
|
||||||
_ => Err(BrowserError::Other(format!(
|
_ => Err(BrowserError::Other(format!(
|
||||||
"Unsupported browser: {}. Supported: firefox, chrome, chromium",
|
"Unsupported browser: '{}'. Supported: firefox, zen, librewolf, waterfox, chrome, chromium",
|
||||||
browser
|
browser
|
||||||
))),
|
))),
|
||||||
}
|
}
|
||||||
@@ -334,7 +428,6 @@ pub fn extract_browser_cookies(
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::env;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_get_home_dir() {
|
fn test_get_home_dir() {
|
||||||
@@ -350,25 +443,26 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_extract_browser_cookies_case_insensitive() {
|
fn test_extract_browser_cookies_case_insensitive() {
|
||||||
// Should not error, just return empty or ProfileNotFound
|
|
||||||
let result = extract_browser_cookies("FIREFOX", None);
|
let result = extract_browser_cookies("FIREFOX", None);
|
||||||
// Either works or profile not found (acceptable in test env)
|
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_zen_browser_recognized() {
|
||||||
|
let result = extract_browser_cookies("zen", None);
|
||||||
|
// Should be ProfileNotFound (not unsupported browser error)
|
||||||
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_firefox_cookies_with_domain() {
|
fn test_firefox_cookies_with_domain() {
|
||||||
// Should not error even if profile not found in test env
|
|
||||||
let result = extract_firefox_cookies(Some("twitter.com"));
|
let result = extract_firefox_cookies(Some("twitter.com"));
|
||||||
// Either works or profile not found (acceptable in test env)
|
|
||||||
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_chrome_cookies_with_domain() {
|
fn test_chrome_cookies_with_domain() {
|
||||||
// Should not error even if profile not found in test env
|
|
||||||
let result = extract_chrome_cookies(Some("twitter.com"));
|
let result = extract_chrome_cookies(Some("twitter.com"));
|
||||||
// Either works or profile not found (acceptable in test env)
|
|
||||||
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
assert!(result.is_ok() || matches!(result, Err(BrowserError::ProfileNotFound(_))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -246,6 +246,7 @@ mod rawkuma;
|
|||||||
mod readcomiconline;
|
mod readcomiconline;
|
||||||
mod schalenetwork;
|
mod schalenetwork;
|
||||||
mod shimmie2;
|
mod shimmie2;
|
||||||
|
mod snapchat;
|
||||||
mod tungsten;
|
mod tungsten;
|
||||||
mod weebdex;
|
mod weebdex;
|
||||||
mod xenforo;
|
mod xenforo;
|
||||||
@@ -861,6 +862,10 @@ pub fn register_all() {
|
|||||||
// Register SimplyHentai extractors (simplyhentai.com)
|
// Register SimplyHentai extractors (simplyhentai.com)
|
||||||
register(simplyhentai::SimplyhentaiExtractor::new().expect("Failed to create SimplyHentai extractor"));
|
register(simplyhentai::SimplyhentaiExtractor::new().expect("Failed to create SimplyHentai extractor"));
|
||||||
|
|
||||||
|
// Register Snapchat extractors (snapchat.com)
|
||||||
|
register(snapchat::SnapchatSpotlightExtractor::new());
|
||||||
|
register(snapchat::SnapchatProfileExtractor::new());
|
||||||
|
|
||||||
// Register Skeb extractors (skeb.jp)
|
// Register Skeb extractors (skeb.jp)
|
||||||
register(skeb::SkebExtractor::new());
|
register(skeb::SkebExtractor::new());
|
||||||
|
|
||||||
|
|||||||
523
src/extractor/extractors/snapchat.rs
Normal file
523
src/extractor/extractors/snapchat.rs
Normal file
@@ -0,0 +1,523 @@
|
|||||||
|
//! Snapchat extractor implementation
|
||||||
|
//!
|
||||||
|
//! Supports public Snapchat content:
|
||||||
|
//! - Spotlight videos: `snapchat.com/spotlight/{id}`
|
||||||
|
//! - Public profiles/stories: `snapchat.com/add/{username}`
|
||||||
|
//!
|
||||||
|
//! Data is extracted from the `__NEXT_DATA__` JSON embedded in the page HTML
|
||||||
|
//! (Next.js server-side rendering). No authentication required for public content.
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use regex::Regex;
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::extractor::{Extractor, ExtractorError, ExtractorMatch, Message};
|
||||||
|
|
||||||
|
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
|
||||||
|
|
||||||
|
/// Extract the `__NEXT_DATA__` JSON blob from a Snapchat page
|
||||||
|
fn extract_next_data(html: &str) -> Option<Value> {
|
||||||
|
let re = Regex::new(r#"<script\s+id="__NEXT_DATA__"\s+type="application/json"[^>]*>(.*?)</script>"#).ok()?;
|
||||||
|
let caps = re.captures(html)?;
|
||||||
|
let json_str = caps.get(1)?.as_str();
|
||||||
|
serde_json::from_str(json_str).ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Recursively search a JSON value for all occurrences of a key
|
||||||
|
fn find_all_values<'a>(json: &'a Value, key: &str) -> Vec<&'a Value> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
match json {
|
||||||
|
Value::Object(map) => {
|
||||||
|
for (k, v) in map {
|
||||||
|
if k == key {
|
||||||
|
results.push(v);
|
||||||
|
}
|
||||||
|
results.extend(find_all_values(v, key));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Value::Array(arr) => {
|
||||||
|
for v in arr {
|
||||||
|
results.extend(find_all_values(v, key));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
results
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract a filename from a CDN URL
|
||||||
|
/// e.g. `https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=...` -> `ABCDEF.mp4`
|
||||||
|
fn cdn_filename(url: &str) -> Option<String> {
|
||||||
|
let parsed = url::Url::parse(url).ok()?;
|
||||||
|
let path = parsed.path();
|
||||||
|
// Path is like /d/HASH.27.IRZXSOY or /TYPE/HASH.27.IRZXSOY
|
||||||
|
let segment = path.rsplit('/').next()?;
|
||||||
|
// Take everything before the first dot as the hash ID
|
||||||
|
let hash = segment.split('.').next()?;
|
||||||
|
if hash.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(format!("{}.mp4", hash))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// SnapchatSpotlightExtractor — single spotlight video
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SnapchatSpotlightExtractor {
|
||||||
|
pattern: Regex,
|
||||||
|
spotlight_id: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SnapchatSpotlightExtractor {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
pattern: Regex::new(
|
||||||
|
r"(?:https?://)?(?:www\.)?snapchat\.com/spotlight/([A-Za-z0-9_-]+)"
|
||||||
|
).expect("Failed to compile Snapchat spotlight pattern"),
|
||||||
|
spotlight_id: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
|
||||||
|
reqwest::Client::builder()
|
||||||
|
.user_agent(USER_AGENT)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.redirect(reqwest::redirect::Policy::limited(10))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| ExtractorError::ConfigError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
|
||||||
|
let client = self.create_client()?;
|
||||||
|
let response = client.get(url).send().await
|
||||||
|
.map_err(ExtractorError::RequestFailed)?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
if status.as_u16() == 404 {
|
||||||
|
return Err(ExtractorError::NotFound(format!("Spotlight not found: {}", url)));
|
||||||
|
}
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
|
||||||
|
}
|
||||||
|
|
||||||
|
response.text().await
|
||||||
|
.map_err(|e| ExtractorError::ParseError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_videos_from_next_data(&self, data: &Value) -> Vec<(String, HashMap<String, Value>)> {
|
||||||
|
let mut videos = Vec::new();
|
||||||
|
|
||||||
|
// Look for contentUrl fields (direct video URLs)
|
||||||
|
let content_urls = find_all_values(data, "contentUrl");
|
||||||
|
for url_val in &content_urls {
|
||||||
|
if let Some(url) = url_val.as_str() {
|
||||||
|
if url.contains("sc-cdn.net") || url.contains(".mp4") {
|
||||||
|
let mut meta = HashMap::new();
|
||||||
|
videos.push((url.to_string(), meta));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also check mediaUrl as a fallback
|
||||||
|
if videos.is_empty() {
|
||||||
|
let media_urls = find_all_values(data, "mediaUrl");
|
||||||
|
for url_val in &media_urls {
|
||||||
|
if let Some(url) = url_val.as_str() {
|
||||||
|
if url.contains("sc-cdn.net") || url.contains(".mp4") {
|
||||||
|
let meta = HashMap::new();
|
||||||
|
videos.push((url.to_string(), meta));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enrich with metadata from the same JSON tree
|
||||||
|
// Try to find upload date, view count, creator info
|
||||||
|
let upload_dates = find_all_values(data, "uploadDateMs");
|
||||||
|
let view_counts = find_all_values(data, "viewCount");
|
||||||
|
let usernames = find_all_values(data, "username");
|
||||||
|
let display_names = find_all_values(data, "displayName");
|
||||||
|
|
||||||
|
for (i, (_url, meta)) in videos.iter_mut().enumerate() {
|
||||||
|
if let Some(date_val) = upload_dates.get(i) {
|
||||||
|
meta.insert("upload_date".to_string(), (*date_val).clone());
|
||||||
|
} else if let Some(date_val) = upload_dates.first() {
|
||||||
|
meta.insert("upload_date".to_string(), (*date_val).clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(count_val) = view_counts.get(i) {
|
||||||
|
meta.insert("view_count".to_string(), (*count_val).clone());
|
||||||
|
} else if let Some(count_val) = view_counts.first() {
|
||||||
|
meta.insert("view_count".to_string(), (*count_val).clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(user_val) = usernames.get(i).or(usernames.first()) {
|
||||||
|
meta.insert("username".to_string(), (*user_val).clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(name_val) = display_names.get(i).or(display_names.first()) {
|
||||||
|
meta.insert("display_name".to_string(), (*name_val).clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
videos
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Extractor for SnapchatSpotlightExtractor {
|
||||||
|
fn category(&self) -> &str { "snapchat" }
|
||||||
|
fn subcategory(&self) -> &str { "spotlight" }
|
||||||
|
fn root(&self) -> &str { "https://www.snapchat.com" }
|
||||||
|
fn pattern(&self) -> &Regex { &self.pattern }
|
||||||
|
|
||||||
|
fn clone_extractor(&self) -> Box<dyn Extractor> {
|
||||||
|
Box::new(self.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
||||||
|
if let Some(caps) = self.pattern.captures(&m.url) {
|
||||||
|
self.spotlight_id = caps.get(1).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
|
let spotlight_id = self.spotlight_id.as_ref()
|
||||||
|
.ok_or_else(|| ExtractorError::NotInitialized("spotlight_id not set".to_string()))?;
|
||||||
|
|
||||||
|
let url = format!("https://www.snapchat.com/spotlight/{}", spotlight_id);
|
||||||
|
log::info!("Fetching Snapchat spotlight: {}", url);
|
||||||
|
|
||||||
|
let html = self.fetch_page(&url).await?;
|
||||||
|
|
||||||
|
let next_data = extract_next_data(&html)
|
||||||
|
.ok_or_else(|| ExtractorError::ParseError(
|
||||||
|
"Could not find __NEXT_DATA__ in page HTML".to_string()
|
||||||
|
))?;
|
||||||
|
|
||||||
|
let videos = self.extract_videos_from_next_data(&next_data);
|
||||||
|
|
||||||
|
if videos.is_empty() {
|
||||||
|
return Err(ExtractorError::ParseError(
|
||||||
|
"No video URLs found in spotlight data".to_string()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut messages = Vec::new();
|
||||||
|
|
||||||
|
// Directory message
|
||||||
|
let creator = videos.first()
|
||||||
|
.and_then(|(_, meta)| meta.get("username"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
|
||||||
|
let dir_msg = Message::directory("")
|
||||||
|
.with_metadata("category", serde_json::json!("snapchat"))
|
||||||
|
.with_metadata("subcategory", serde_json::json!("spotlight"))
|
||||||
|
.with_metadata("title", serde_json::json!(format!("spotlight_{}", spotlight_id)))
|
||||||
|
.with_metadata("creator", serde_json::json!(creator));
|
||||||
|
messages.push(dir_msg);
|
||||||
|
|
||||||
|
for (video_url, meta) in &videos {
|
||||||
|
let filename = cdn_filename(video_url)
|
||||||
|
.unwrap_or_else(|| format!("{}.mp4", spotlight_id));
|
||||||
|
|
||||||
|
let mut msg = Message::url(video_url)
|
||||||
|
.with_filename(&filename);
|
||||||
|
|
||||||
|
for (key, val) in meta {
|
||||||
|
msg = msg.with_metadata(key, val.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
messages.push(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
log::info!("Found {} video(s) in spotlight {}", videos.len(), spotlight_id);
|
||||||
|
Ok(messages)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// SnapchatProfileExtractor — public profile stories
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SnapchatProfileExtractor {
|
||||||
|
pattern: Regex,
|
||||||
|
username: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SnapchatProfileExtractor {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
pattern: Regex::new(
|
||||||
|
r"(?:https?://)?(?:www\.)?snapchat\.com/add/([A-Za-z0-9._-]+)"
|
||||||
|
).expect("Failed to compile Snapchat profile pattern"),
|
||||||
|
username: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
|
||||||
|
reqwest::Client::builder()
|
||||||
|
.user_agent(USER_AGENT)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.redirect(reqwest::redirect::Policy::limited(10))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| ExtractorError::ConfigError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
|
||||||
|
let client = self.create_client()?;
|
||||||
|
let response = client.get(url).send().await
|
||||||
|
.map_err(ExtractorError::RequestFailed)?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
if status.as_u16() == 404 {
|
||||||
|
return Err(ExtractorError::NotFound(format!("Profile not found: {}", url)));
|
||||||
|
}
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
|
||||||
|
}
|
||||||
|
|
||||||
|
response.text().await
|
||||||
|
.map_err(|e| ExtractorError::ParseError(e.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Extractor for SnapchatProfileExtractor {
|
||||||
|
fn category(&self) -> &str { "snapchat" }
|
||||||
|
fn subcategory(&self) -> &str { "profile" }
|
||||||
|
fn root(&self) -> &str { "https://www.snapchat.com" }
|
||||||
|
fn pattern(&self) -> &Regex { &self.pattern }
|
||||||
|
|
||||||
|
fn clone_extractor(&self) -> Box<dyn Extractor> {
|
||||||
|
Box::new(self.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
||||||
|
if let Some(caps) = self.pattern.captures(&m.url) {
|
||||||
|
self.username = caps.get(1).map(|m| m.as_str().to_string());
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
|
let username = self.username.as_ref()
|
||||||
|
.ok_or_else(|| ExtractorError::NotInitialized("username not set".to_string()))?;
|
||||||
|
|
||||||
|
let url = format!("https://www.snapchat.com/add/{}", username);
|
||||||
|
log::info!("Fetching Snapchat profile: {}", url);
|
||||||
|
|
||||||
|
let html = self.fetch_page(&url).await?;
|
||||||
|
|
||||||
|
let next_data = extract_next_data(&html)
|
||||||
|
.ok_or_else(|| ExtractorError::ParseError(
|
||||||
|
"Could not find __NEXT_DATA__ in page HTML. Profile may be private or empty.".to_string()
|
||||||
|
))?;
|
||||||
|
|
||||||
|
// Extract all media URLs from the profile data
|
||||||
|
let mut media_urls: Vec<String> = Vec::new();
|
||||||
|
|
||||||
|
// Look for contentUrl (videos)
|
||||||
|
for val in find_all_values(&next_data, "contentUrl") {
|
||||||
|
if let Some(url) = val.as_str() {
|
||||||
|
if url.contains("sc-cdn.net") || url.contains(".mp4") {
|
||||||
|
media_urls.push(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for mediaUrl (alternate/additional media)
|
||||||
|
for val in find_all_values(&next_data, "mediaUrl") {
|
||||||
|
if let Some(url) = val.as_str() {
|
||||||
|
if (url.contains("sc-cdn.net") || url.contains(".mp4") || url.contains(".jpg") || url.contains(".png"))
|
||||||
|
&& !media_urls.contains(&url.to_string())
|
||||||
|
{
|
||||||
|
media_urls.push(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for snapMediaUrl (story media)
|
||||||
|
for val in find_all_values(&next_data, "snapMediaUrl") {
|
||||||
|
if let Some(url) = val.as_str() {
|
||||||
|
if !media_urls.contains(&url.to_string()) {
|
||||||
|
media_urls.push(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for thumbnailUrl (image previews)
|
||||||
|
for val in find_all_values(&next_data, "thumbnailUrl") {
|
||||||
|
if let Some(url) = val.as_str() {
|
||||||
|
if !media_urls.contains(&url.to_string()) {
|
||||||
|
media_urls.push(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut messages = Vec::new();
|
||||||
|
|
||||||
|
// Directory message
|
||||||
|
let display_name = find_all_values(&next_data, "displayName")
|
||||||
|
.first()
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or(username.as_str())
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let dir_msg = Message::directory("")
|
||||||
|
.with_metadata("category", serde_json::json!("snapchat"))
|
||||||
|
.with_metadata("subcategory", serde_json::json!("profile"))
|
||||||
|
.with_metadata("title", serde_json::json!(&display_name))
|
||||||
|
.with_metadata("username", serde_json::json!(username));
|
||||||
|
messages.push(dir_msg);
|
||||||
|
|
||||||
|
if media_urls.is_empty() {
|
||||||
|
log::warn!("No media found on profile {}. It may be private or have no public stories.", username);
|
||||||
|
return Ok(messages);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i, media_url) in media_urls.iter().enumerate() {
|
||||||
|
let filename = cdn_filename(media_url)
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
let ext = if media_url.contains(".mp4") { "mp4" }
|
||||||
|
else if media_url.contains(".jpg") || media_url.contains(".jpeg") { "jpg" }
|
||||||
|
else if media_url.contains(".png") { "png" }
|
||||||
|
else { "mp4" };
|
||||||
|
format!("{}_{:03}.{}", username, i + 1, ext)
|
||||||
|
});
|
||||||
|
|
||||||
|
let msg = Message::url(media_url)
|
||||||
|
.with_filename(&filename)
|
||||||
|
.with_metadata("username", serde_json::json!(username))
|
||||||
|
.with_metadata("num", serde_json::json!(i + 1));
|
||||||
|
|
||||||
|
messages.push(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
log::info!("Found {} media item(s) on profile {}", media_urls.len(), username);
|
||||||
|
Ok(messages)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_spotlight_pattern() {
        let ext = SnapchatSpotlightExtractor::new();
        // Accepted: spotlight URLs with/without scheme and www, ids with -_.
        assert!(ext.pattern.is_match("https://www.snapchat.com/spotlight/ABC123_def"));
        assert!(ext.pattern.is_match("https://snapchat.com/spotlight/ABC123"));
        assert!(ext.pattern.is_match("http://www.snapchat.com/spotlight/test-id_123"));
        // Rejected: profile URLs and the bare domain.
        assert!(!ext.pattern.is_match("https://snapchat.com/add/username"));
        assert!(!ext.pattern.is_match("https://snapchat.com/"));
    }

    #[test]
    fn test_profile_pattern() {
        let ext = SnapchatProfileExtractor::new();
        // Accepted: usernames may contain dots, underscores and dashes.
        assert!(ext.pattern.is_match("https://www.snapchat.com/add/john_doe"));
        assert!(ext.pattern.is_match("https://snapchat.com/add/user.name"));
        assert!(ext.pattern.is_match("http://www.snapchat.com/add/test-user"));
        // Rejected: spotlight URLs and the bare domain.
        assert!(!ext.pattern.is_match("https://snapchat.com/spotlight/ABC123"));
        assert!(!ext.pattern.is_match("https://snapchat.com/"));
    }

    #[test]
    fn test_extract_next_data() {
        let html = r#"<html><head><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"story":{"contentUrl":"https://cf-st.sc-cdn.net/d/HASH123.27.IRZXSOY?mo=test"}}}}</script></head></html>"#;
        let data = extract_next_data(html).expect("__NEXT_DATA__ should be found");
        let urls = find_all_values(&data, "contentUrl");
        assert_eq!(urls.len(), 1);
        assert_eq!(
            urls[0].as_str().unwrap(),
            "https://cf-st.sc-cdn.net/d/HASH123.27.IRZXSOY?mo=test"
        );
    }

    #[test]
    fn test_extract_next_data_missing() {
        let html = r#"<html><head></head><body>No next data here</body></html>"#;
        assert!(extract_next_data(html).is_none());
    }

    #[test]
    fn test_find_all_values() {
        // Three occurrences of "contentUrl" at different nesting depths,
        // including one inside an array element.
        let json: Value = serde_json::json!({
            "a": {
                "contentUrl": "url1",
                "nested": { "contentUrl": "url2" }
            },
            "b": [
                { "contentUrl": "url3" },
                { "other": "ignored" }
            ]
        });
        assert_eq!(find_all_values(&json, "contentUrl").len(), 3);
    }

    #[test]
    fn test_cdn_filename() {
        assert_eq!(
            cdn_filename("https://cf-st.sc-cdn.net/d/ABCDEF.27.IRZXSOY?mo=test&uc=46"),
            Some("ABCDEF.mp4".to_string())
        );
        assert_eq!(
            cdn_filename("https://bolt-gcdn.sc-cdn.net/video/HASH123.27.IRZXSOY?mo=test"),
            Some("HASH123.mp4".to_string())
        );
    }

    #[test]
    fn test_cdn_filename_no_hash() {
        // Ordinary URLs should still produce some filename.
        assert!(cdn_filename("https://example.com/some/path/file.mp4").is_some());
    }

    #[test]
    fn test_spotlight_extract_videos() {
        let ext = SnapchatSpotlightExtractor::new();
        let data: Value = serde_json::json!({
            "props": { "pageProps": { "story": {
                "contentUrl": "https://cf-st.sc-cdn.net/d/ABC.27.IRZXSOY?mo=test",
                "uploadDateMs": 1700000000000_u64,
                "viewCount": 50000,
                "username": "testuser",
                "displayName": "Test User"
            } } }
        });
        let videos = ext.extract_videos_from_next_data(&data);
        assert_eq!(videos.len(), 1);
        assert!(videos[0].0.contains("sc-cdn.net"));
        assert!(videos[0].1.contains_key("username"));
    }

    #[test]
    fn test_spotlight_mediaurl_fallback() {
        // When no contentUrl exists, mediaUrl entries are used instead.
        let ext = SnapchatSpotlightExtractor::new();
        let data: Value = serde_json::json!({
            "props": { "pageProps": { "media": {
                "mediaUrl": "https://cf-st.sc-cdn.net/d/FALLBACK.27.IRZXSOY?mo=x"
            } } }
        });
        let videos = ext.extract_videos_from_next_data(&data);
        assert_eq!(videos.len(), 1);
        assert!(videos[0].0.contains("FALLBACK"));
    }
}
|
||||||
@@ -1,40 +1,462 @@
|
|||||||
//! XenForo extractor implementation
|
//! XenForo extractor implementation
|
||||||
//!
|
//!
|
||||||
//! Supports XenForo forums (simpcity.cr, nudostar.com/forum, etc.)
|
//! Supports XenForo forums (simpcity.cr, nudostar.com/forum, etc.)
|
||||||
|
//! Extracts images and videos from thread posts with pagination support.
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use crate::extractor::{
|
use crate::extractor::{
|
||||||
Extractor, ExtractorError, ExtractorMatch, HttpClient, Message,
|
Extractor, ExtractorError, ExtractorMatch, Message, MessageKind,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct XenforoPostExtractor {
|
/// HTTP User-Agent presented to XenForo forums (mimics desktop Chrome).
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/// Root URL for a matched forum domain.
///
/// All currently known hosts (simpcity.cr / simpcity.su, nudostar.com/forum,
/// allthefallen.moe/forum, celebforum.to, forums.socialmediagirls.com, ...)
/// follow the same `https://<domain>` scheme — the `domain` capture already
/// includes any forum subdirectory (e.g. "nudostar.com/forum"), so the old
/// per-host match arms all produced exactly this string and were dead code.
/// Kept as a function so a future host with a divergent root can be
/// special-cased here without touching callers.
fn root_for_domain(domain: &str) -> String {
    format!("https://{}", domain)
}
|
||||||
|
|
||||||
|
/// Build a `Cookie:` header value ("name=value; name=value") from a map.
///
/// Pairs are sorted by cookie name so the header is deterministic —
/// HashMap iteration order is randomized per process, which previously
/// made the emitted header vary between runs. Servers do not depend on
/// cookie ordering, so sorting is safe.
fn cookie_header(cookies: &HashMap<String, String>) -> String {
    let mut pairs: Vec<(&String, &String)> = cookies.iter().collect();
    pairs.sort_by(|a, b| a.0.cmp(b.0));
    pairs.into_iter()
        .map(|(name, value)| format!("{}={}", name, value))
        .collect::<Vec<_>>()
        .join("; ")
}
|
||||||
|
|
||||||
|
/// Extract media URLs from HTML content.
|
||||||
|
///
|
||||||
|
/// Finds all media by matching multiple patterns:
|
||||||
|
/// - `<img class="bbImage" src="...">` — inline images
|
||||||
|
/// - `<video src="...">` — inline videos
|
||||||
|
/// - `<a href=".../attachments/...">` — file attachments
|
||||||
|
/// - `<iframe src="...">` — embedded media
|
||||||
|
/// - `loadMedia(this, '...')` — lazy-loaded embeds
|
||||||
|
fn extract_media_from_html(html: &str, root_url: &str) -> Vec<String> {
|
||||||
|
let mut urls = Vec::new();
|
||||||
|
|
||||||
|
// 1. bbImage: <img ... class="bbImage" ... src="URL"> or data-url="URL"
|
||||||
|
let img_re = Regex::new(r#"<img[^>]+class="bbImage[^"]*"[^>]*(?:data-url|src)="([^"]+)"|<img[^>]*(?:data-url|src)="([^"]+)"[^>]*class="bbImage[^"]*""#).unwrap();
|
||||||
|
for caps in img_re.captures_iter(html) {
|
||||||
|
if let Some(m) = caps.get(1).or(caps.get(2)) {
|
||||||
|
urls.push(m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Video src
|
||||||
|
let video_re = Regex::new(r#"<video[^>]+src="([^"]+)"#).unwrap();
|
||||||
|
for caps in video_re.captures_iter(html) {
|
||||||
|
if let Some(m) = caps.get(1) {
|
||||||
|
urls.push(m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Attachments
|
||||||
|
let attach_re = Regex::new(r#"<a[^>]+href="([^"]+/attachments/[^"]+)"#).unwrap();
|
||||||
|
for caps in attach_re.captures_iter(html) {
|
||||||
|
if let Some(m) = caps.get(1) {
|
||||||
|
urls.push(m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Iframes
|
||||||
|
let iframe_re = Regex::new(r#"<iframe[^>]+src="([^"]+)"#).unwrap();
|
||||||
|
for caps in iframe_re.captures_iter(html) {
|
||||||
|
if let Some(m) = caps.get(1) {
|
||||||
|
urls.push(m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. Lazy-loaded media
|
||||||
|
let lazy_re = Regex::new(r#"loadMedia\(this,\s*'([^']+)'"#).unwrap();
|
||||||
|
for caps in lazy_re.captures_iter(html) {
|
||||||
|
if let Some(m) = caps.get(1) {
|
||||||
|
urls.push(m.as_str().to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalize and filter
|
||||||
|
urls.into_iter()
|
||||||
|
.filter_map(|u| normalize_url(&u, root_url))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize a URL: resolve relative paths, upgrade protocol, skip junk
|
||||||
|
fn normalize_url(url: &str, root_url: &str) -> Option<String> {
|
||||||
|
// Skip smilies, avatars, style assets, base64 data URIs
|
||||||
|
if url.contains("/styles/") || url.contains("/smilies/")
|
||||||
|
|| url.contains("data/avatars/") || url.contains("data:image")
|
||||||
|
|| url.contains("/icons/") || url.contains("reaction-sprite")
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut u = url.to_string();
|
||||||
|
|
||||||
|
if u.starts_with("//") {
|
||||||
|
u = format!("https:{}", u);
|
||||||
|
} else if u.starts_with('/') {
|
||||||
|
u = format!("{}{}", root_url, u);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !u.starts_with("http://") && !u.starts_with("https://") {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upgrade .md.jpg thumbnails to full size (simpcity CDN pattern)
|
||||||
|
u = upgrade_thumbnail(&u);
|
||||||
|
|
||||||
|
Some(u)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Upgrade simpcity CDN thumbnail URLs to full-size
|
||||||
|
/// e.g. image.md.jpg -> image.jpg
|
||||||
|
fn upgrade_thumbnail(url: &str) -> String {
|
||||||
|
let re = Regex::new(r"\.md\.(jpg|jpeg|png|gif|webp)(\?|$)").unwrap();
|
||||||
|
re.replace(url, ".$1$2").into_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode the handful of HTML entities XenForo commonly emits in titles.
///
/// `&amp;` is decoded LAST so that double-escaped sequences such as
/// `&amp;lt;` decode to the literal text `&lt;` instead of being
/// decoded twice into `<`. Both the numeric (`&#39;`) and named
/// (`&apos;`) apostrophe forms are handled.
fn decode_html_entities(s: &str) -> String {
    s.replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
|
||||||
|
|
||||||
|
/// Extract the thread title from the page HTML
|
||||||
|
fn extract_thread_title(html: &str) -> Option<String> {
|
||||||
|
let re = Regex::new(r#"<h1[^>]*class="[^"]*p-title-value[^"]*"[^>]*>(.*?)</h1>"#).ok()?;
|
||||||
|
re.captures(html)
|
||||||
|
.and_then(|c| c.get(1))
|
||||||
|
.map(|m| {
|
||||||
|
// Strip inner tags like <span>
|
||||||
|
let tag_re = Regex::new(r"<[^>]+>").unwrap();
|
||||||
|
let title = tag_re.replace_all(m.as_str().trim(), "").trim().to_string();
|
||||||
|
// Decode HTML entities
|
||||||
|
decode_html_entities(&title)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find the next page URL from pagination
|
||||||
|
fn find_next_page(html: &str) -> Option<String> {
|
||||||
|
// Handle both attribute orderings: class before href, or href before class
|
||||||
|
let re = Regex::new(
|
||||||
|
r#"<a[^>]*href="([^"]+)"[^>]*class="[^"]*pageNav-jump--next[^"]*"|<a[^>]*class="[^"]*pageNav-jump--next[^"]*"[^>]*href="([^"]+)""#
|
||||||
|
).ok()?;
|
||||||
|
re.captures(html).and_then(|c| {
|
||||||
|
c.get(1).or(c.get(2))
|
||||||
|
}).map(|m| {
|
||||||
|
m.as_str().replace("&", "&")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract individual post blocks from the page HTML.
|
||||||
|
///
|
||||||
|
/// XenForo posts are `<article>` elements with `data-content="post-NNNNN"`.
|
||||||
|
/// We split the HTML at each post boundary and extract the content between them.
|
||||||
|
fn extract_posts(html: &str) -> Vec<(String, String)> {
|
||||||
|
let boundary_re = Regex::new(r#"data-content="post-(\d+)""#)
|
||||||
|
.expect("Failed to compile post boundary regex");
|
||||||
|
|
||||||
|
let matches: Vec<_> = boundary_re.captures_iter(html)
|
||||||
|
.filter_map(|c| {
|
||||||
|
let full = c.get(0)?;
|
||||||
|
let id = c.get(1)?.as_str().to_string();
|
||||||
|
Some((id, full.start()))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if matches.is_empty() {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut posts = Vec::new();
|
||||||
|
for i in 0..matches.len() {
|
||||||
|
let (ref id, start) = matches[i];
|
||||||
|
let end = if i + 1 < matches.len() {
|
||||||
|
matches[i + 1].1
|
||||||
|
} else {
|
||||||
|
html.len()
|
||||||
|
};
|
||||||
|
let post_html = &html[start..end];
|
||||||
|
posts.push((id.clone(), post_html.to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
posts
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// XenforoThreadExtractor
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
pub struct XenforoThreadExtractor {
|
pub struct XenforoThreadExtractor {
|
||||||
pattern: Regex,
|
pattern: Regex,
|
||||||
category: String,
|
category: String,
|
||||||
subcategory: String,
|
subcategory: String,
|
||||||
root_url: String,
|
root_url: String,
|
||||||
|
domain: Option<String>,
|
||||||
|
thread_path: Option<String>,
|
||||||
thread_id: Option<String>,
|
thread_id: Option<String>,
|
||||||
page: Option<i64>,
|
page: Option<i64>,
|
||||||
client: HttpClient,
|
cookies: HashMap<String, String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct XenforoForumExtractor {
|
impl XenforoThreadExtractor {
|
||||||
|
pub fn new() -> Result<Self, ExtractorError> {
|
||||||
|
let pattern = Regex::new(
|
||||||
|
r"(?:https?://)?(?:www\.)?(simpcity\.cr|simpcity\.su|nudostar\.com/forum|allthefallen\.moe/forum|celebforum\.to|titsintops\.com/phpBB2|forums\.socialmediagirls\.com)(/(?:index\.php\?)?threads/(?:[^/?#]+\.)?(\d+))(?:/page-(\d+))?"
|
||||||
|
).map_err(|e| ExtractorError::ConfigError(e.to_string()))?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
pattern,
|
||||||
|
category: "xenforo".to_string(),
|
||||||
|
subcategory: "thread".to_string(),
|
||||||
|
root_url: "https://simpcity.cr".to_string(),
|
||||||
|
domain: None,
|
||||||
|
thread_path: None,
|
||||||
|
thread_id: None,
|
||||||
|
page: None,
|
||||||
|
cookies: HashMap::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
|
||||||
|
reqwest::Client::builder()
|
||||||
|
.user_agent(USER_AGENT)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.redirect(reqwest::redirect::Policy::limited(10))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| ExtractorError::ConfigError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_page(&self, url: &str) -> Result<String, ExtractorError> {
|
||||||
|
let client = self.create_client()?;
|
||||||
|
let mut request = client.get(url);
|
||||||
|
|
||||||
|
if !self.cookies.is_empty() {
|
||||||
|
request = request.header("Cookie", cookie_header(&self.cookies));
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = request.send().await
|
||||||
|
.map_err(ExtractorError::RequestFailed)?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
if status.as_u16() == 403 || status.as_u16() == 401 {
|
||||||
|
return Err(ExtractorError::ConfigError(format!(
|
||||||
|
"Authentication required (HTTP {}). Set cookies in config: \
|
||||||
|
extractor.xenforo.cookies.xf_user = \"your_cookie_value\"",
|
||||||
|
status.as_u16()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
|
||||||
|
}
|
||||||
|
|
||||||
|
response.text().await
|
||||||
|
.map_err(|e| ExtractorError::ParseError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn extract_thread(&self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
|
let thread_path = self.thread_path.as_ref()
|
||||||
|
.ok_or_else(|| ExtractorError::NotInitialized("thread_path not set".to_string()))?;
|
||||||
|
|
||||||
|
let mut messages = Vec::new();
|
||||||
|
let mut seen_urls: HashSet<String> = HashSet::new();
|
||||||
|
|
||||||
|
// Build the starting URL
|
||||||
|
let start_url = if let Some(page) = self.page {
|
||||||
|
format!("{}{}/page-{}", self.root_url, thread_path, page)
|
||||||
|
} else {
|
||||||
|
format!("{}{}/", self.root_url, thread_path)
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut current_url = Some(start_url);
|
||||||
|
let mut page_num = self.page.unwrap_or(1);
|
||||||
|
let mut total_media = 0;
|
||||||
|
|
||||||
|
while let Some(url) = current_url.take() {
|
||||||
|
log::info!("Fetching page {} of thread: {}", page_num, url);
|
||||||
|
|
||||||
|
let html = self.fetch_page(&url).await?;
|
||||||
|
|
||||||
|
// Extract thread title on first page for the directory message
|
||||||
|
if page_num <= 1 || (self.page.is_some() && page_num == self.page.unwrap()) {
|
||||||
|
let title = extract_thread_title(&html)
|
||||||
|
.unwrap_or_else(|| "unknown".to_string());
|
||||||
|
log::info!("Thread title: {}", title);
|
||||||
|
|
||||||
|
let mut dir_msg = Message::directory("");
|
||||||
|
dir_msg.metadata.insert("thread_id".to_string(),
|
||||||
|
serde_json::json!(self.thread_id.as_deref().unwrap_or("0")));
|
||||||
|
dir_msg.metadata.insert("title".to_string(), serde_json::json!(title));
|
||||||
|
dir_msg.metadata.insert("category".to_string(), serde_json::json!("xenforo"));
|
||||||
|
messages.push(dir_msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract posts and their media
|
||||||
|
let posts = extract_posts(&html);
|
||||||
|
log::info!("Found {} posts on page {}", posts.len(), page_num);
|
||||||
|
|
||||||
|
for (post_id, post_html) in &posts {
|
||||||
|
let media_urls = extract_media_from_html(post_html, &self.root_url);
|
||||||
|
|
||||||
|
for media_url in media_urls {
|
||||||
|
if seen_urls.contains(&media_url) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen_urls.insert(media_url.clone());
|
||||||
|
|
||||||
|
let msg = Message::url(&media_url)
|
||||||
|
.with_metadata("post_id", serde_json::json!(post_id))
|
||||||
|
.with_metadata("thread_id",
|
||||||
|
serde_json::json!(self.thread_id.as_deref().unwrap_or("0")));
|
||||||
|
|
||||||
|
// Try to extract a filename from the URL
|
||||||
|
if let Some(filename) = url_filename(&media_url) {
|
||||||
|
messages.push(msg.with_filename(filename));
|
||||||
|
} else {
|
||||||
|
messages.push(msg);
|
||||||
|
}
|
||||||
|
total_media += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no posts found at all, try a simpler fallback: just extract all media from the page
|
||||||
|
if posts.is_empty() {
|
||||||
|
log::warn!("No post blocks found on page {} — trying full-page scan", page_num);
|
||||||
|
let media_urls = extract_media_from_html(&html, &self.root_url);
|
||||||
|
for media_url in media_urls {
|
||||||
|
if seen_urls.contains(&media_url) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen_urls.insert(media_url.clone());
|
||||||
|
|
||||||
|
let msg = Message::url(&media_url);
|
||||||
|
if let Some(filename) = url_filename(&media_url) {
|
||||||
|
messages.push(msg.with_filename(filename));
|
||||||
|
} else {
|
||||||
|
messages.push(msg);
|
||||||
|
}
|
||||||
|
total_media += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for next page
|
||||||
|
if let Some(next_href) = find_next_page(&html) {
|
||||||
|
let next_url = if next_href.starts_with("http") {
|
||||||
|
next_href
|
||||||
|
} else {
|
||||||
|
format!("{}{}", self.root_url, next_href)
|
||||||
|
};
|
||||||
|
current_url = Some(next_url);
|
||||||
|
page_num += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log::info!("Extracted {} media URLs across {} pages", total_media, page_num);
|
||||||
|
Ok(messages)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Try to extract a usable filename from a URL
|
||||||
|
fn url_filename(url: &str) -> Option<String> {
|
||||||
|
let path = url::Url::parse(url).ok()?.path().to_string();
|
||||||
|
let segment = path.rsplit('/').next()?;
|
||||||
|
if segment.is_empty() || !segment.contains('.') {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// URL-decode the filename
|
||||||
|
let decoded = urlencoding::decode(segment).ok()?;
|
||||||
|
Some(decoded.into_owned())
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for XenforoThreadExtractor {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new().expect("Failed to create XenforoThreadExtractor")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Clone for XenforoThreadExtractor {
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Self {
|
||||||
|
pattern: self.pattern.clone(),
|
||||||
|
category: self.category.clone(),
|
||||||
|
subcategory: self.subcategory.clone(),
|
||||||
|
root_url: self.root_url.clone(),
|
||||||
|
domain: self.domain.clone(),
|
||||||
|
thread_path: self.thread_path.clone(),
|
||||||
|
thread_id: self.thread_id.clone(),
|
||||||
|
page: self.page,
|
||||||
|
cookies: self.cookies.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Extractor for XenforoThreadExtractor {
|
||||||
|
fn category(&self) -> &str { &self.category }
|
||||||
|
fn subcategory(&self) -> &str { &self.subcategory }
|
||||||
|
fn root(&self) -> &str { &self.root_url }
|
||||||
|
fn pattern(&self) -> &Regex { &self.pattern }
|
||||||
|
fn clone_extractor(&self) -> Box<dyn Extractor> { Box::new(self.clone()) }
|
||||||
|
|
||||||
|
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
||||||
|
if let Some(captures) = self.pattern.captures(&m.url) {
|
||||||
|
if let Some(domain) = captures.get(1) {
|
||||||
|
self.domain = Some(domain.as_str().to_string());
|
||||||
|
self.root_url = root_for_domain(domain.as_str());
|
||||||
|
}
|
||||||
|
if let Some(path) = captures.get(2) {
|
||||||
|
self.thread_path = Some(path.as_str().to_string());
|
||||||
|
}
|
||||||
|
if let Some(id) = captures.get(3) {
|
||||||
|
self.thread_id = Some(id.as_str().to_string());
|
||||||
|
}
|
||||||
|
if let Some(page) = captures.get(4) {
|
||||||
|
self.page = Some(page.as_str().parse::<i64>().unwrap_or(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log::info!("Initialized XenForo thread extractor: path={:?} id={:?} page={:?}",
|
||||||
|
self.thread_path, self.thread_id, self.page);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
|
self.extract_thread().await
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_cookies(&mut self, cookies: HashMap<String, String>) {
|
||||||
|
self.cookies = cookies;
|
||||||
|
log::debug!("XenForo cookies set: {} entries", self.cookies.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// XenforoPostExtractor
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
pub struct XenforoPostExtractor {
|
||||||
pattern: Regex,
|
pattern: Regex,
|
||||||
category: String,
|
category: String,
|
||||||
subcategory: String,
|
subcategory: String,
|
||||||
root_url: String,
|
root_url: String,
|
||||||
forum_id: Option<String>,
|
domain: Option<String>,
|
||||||
client: HttpClient,
|
post_id: Option<String>,
|
||||||
|
post_url_prefix: Option<String>,
|
||||||
|
cookies: HashMap<String, String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl XenforoPostExtractor {
|
impl XenforoPostExtractor {
|
||||||
@@ -48,27 +470,79 @@ impl XenforoPostExtractor {
|
|||||||
category: "xenforo".to_string(),
|
category: "xenforo".to_string(),
|
||||||
subcategory: "post".to_string(),
|
subcategory: "post".to_string(),
|
||||||
root_url: "https://simpcity.cr".to_string(),
|
root_url: "https://simpcity.cr".to_string(),
|
||||||
|
domain: None,
|
||||||
post_id: None,
|
post_id: None,
|
||||||
client: HttpClient::builder()
|
post_url_prefix: None,
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
cookies: HashMap::new(),
|
||||||
.build()
|
|
||||||
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn create_client(&self) -> Result<reqwest::Client, ExtractorError> {
|
||||||
|
reqwest::Client::builder()
|
||||||
|
.user_agent(USER_AGENT)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.redirect(reqwest::redirect::Policy::limited(10))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| ExtractorError::ConfigError(e.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
async fn extract_post(&self) -> Result<Vec<Message>, ExtractorError> {
|
async fn extract_post(&self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
let post_id = self.post_id.as_ref()
|
let post_id = self.post_id.as_ref()
|
||||||
.ok_or_else(|| ExtractorError::NotInitialized("post_id not set".to_string()))?;
|
.ok_or_else(|| ExtractorError::NotInitialized("post_id not set".to_string()))?;
|
||||||
|
|
||||||
log::info!("Extracting XenForo post: {}", post_id);
|
log::info!("Extracting XenForo post: {}", post_id);
|
||||||
|
|
||||||
let mut messages = Vec::new();
|
// Fetch the post page
|
||||||
|
let url = format!("{}/posts/{}/", self.root_url, post_id);
|
||||||
|
let client = self.create_client()?;
|
||||||
|
let mut request = client.get(&url);
|
||||||
|
if !self.cookies.is_empty() {
|
||||||
|
request = request.header("Cookie", cookie_header(&self.cookies));
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = request.send().await
|
||||||
|
.map_err(ExtractorError::RequestFailed)?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(ExtractorError::HttpError(format!("HTTP {}", status.as_u16())));
|
||||||
|
}
|
||||||
|
|
||||||
|
let html = response.text().await
|
||||||
|
.map_err(|e| ExtractorError::ParseError(e.to_string()))?;
|
||||||
|
|
||||||
|
let mut messages = Vec::new();
|
||||||
|
let mut seen_urls: HashSet<String> = HashSet::new();
|
||||||
|
|
||||||
|
// Directory message
|
||||||
let mut dir_msg = Message::directory("");
|
let mut dir_msg = Message::directory("");
|
||||||
dir_msg.metadata.insert("post_id".to_string(), serde_json::json!(post_id.parse::<i64>().unwrap_or(0)));
|
dir_msg.metadata.insert("post_id".to_string(), serde_json::json!(post_id));
|
||||||
messages.push(dir_msg);
|
messages.push(dir_msg);
|
||||||
|
|
||||||
log::info!("Found XenForo post {}", post_id);
|
// Try to find just the target post
|
||||||
|
let posts = extract_posts(&html);
|
||||||
|
let target_html = posts.iter()
|
||||||
|
.find(|(id, _)| id == post_id)
|
||||||
|
.map(|(_, content)| content.as_str())
|
||||||
|
.unwrap_or(&html);
|
||||||
|
|
||||||
|
let media_urls = extract_media_from_html(target_html, &self.root_url);
|
||||||
|
for media_url in media_urls {
|
||||||
|
if seen_urls.contains(&media_url) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen_urls.insert(media_url.clone());
|
||||||
|
|
||||||
|
let msg = Message::url(&media_url)
|
||||||
|
.with_metadata("post_id", serde_json::json!(post_id));
|
||||||
|
if let Some(filename) = url_filename(&media_url) {
|
||||||
|
messages.push(msg.with_filename(filename));
|
||||||
|
} else {
|
||||||
|
messages.push(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log::info!("Extracted {} media URLs from post {}", messages.len() - 1, post_id);
|
||||||
Ok(messages)
|
Ok(messages)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -86,11 +560,10 @@ impl Clone for XenforoPostExtractor {
|
|||||||
category: self.category.clone(),
|
category: self.category.clone(),
|
||||||
subcategory: self.subcategory.clone(),
|
subcategory: self.subcategory.clone(),
|
||||||
root_url: self.root_url.clone(),
|
root_url: self.root_url.clone(),
|
||||||
|
domain: self.domain.clone(),
|
||||||
post_id: self.post_id.clone(),
|
post_id: self.post_id.clone(),
|
||||||
client: HttpClient::builder()
|
post_url_prefix: self.post_url_prefix.clone(),
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
cookies: self.cookies.clone(),
|
||||||
.build()
|
|
||||||
.expect("Failed to create HTTP client"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -105,7 +578,14 @@ impl Extractor for XenforoPostExtractor {
|
|||||||
|
|
||||||
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
||||||
if let Some(captures) = self.pattern.captures(&m.url) {
|
if let Some(captures) = self.pattern.captures(&m.url) {
|
||||||
if let Some(id) = captures.get(2) {
|
if let Some(domain) = captures.get(1) {
|
||||||
|
self.domain = Some(domain.as_str().to_string());
|
||||||
|
self.root_url = root_for_domain(domain.as_str());
|
||||||
|
}
|
||||||
|
if let Some(prefix) = captures.get(2) {
|
||||||
|
self.post_url_prefix = Some(prefix.as_str().to_string());
|
||||||
|
}
|
||||||
|
if let Some(id) = captures.get(3) {
|
||||||
self.post_id = Some(id.as_str().to_string());
|
self.post_id = Some(id.as_str().to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -115,91 +595,24 @@ impl Extractor for XenforoPostExtractor {
|
|||||||
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
self.extract_post().await
|
self.extract_post().await
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl XenforoThreadExtractor {
|
fn set_cookies(&mut self, cookies: HashMap<String, String>) {
|
||||||
pub fn new() -> Result<Self, ExtractorError> {
|
self.cookies = cookies;
|
||||||
let pattern = Regex::new(
|
|
||||||
r"(?:https?://)?(?:www\.)?(simpcity\.cr|simpcity\.su|nudostar\.com/forum|allthefallen\.moe/forum|celebforum\.to|titsintops\.com/phpBB2|forums\.socialmediagirls\.com)(/(?:index\.php\?)?threads/(?:[^/?#]+\.)?(\d+))(?:/page-(\d+))?"
|
|
||||||
).map_err(|e| ExtractorError::ConfigError(e.to_string()))?;
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
pattern,
|
|
||||||
category: "xenforo".to_string(),
|
|
||||||
subcategory: "thread".to_string(),
|
|
||||||
root_url: "https://simpcity.cr".to_string(),
|
|
||||||
thread_id: None,
|
|
||||||
page: None,
|
|
||||||
client: HttpClient::builder()
|
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
||||||
.build()
|
|
||||||
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn extract_thread(&self) -> Result<Vec<Message>, ExtractorError> {
|
|
||||||
let thread_id = self.thread_id.as_ref()
|
|
||||||
.ok_or_else(|| ExtractorError::NotInitialized("thread_id not set".to_string()))?;
|
|
||||||
|
|
||||||
log::info!("Extracting XenForo thread: {}", thread_id);
|
|
||||||
|
|
||||||
let mut messages = Vec::new();
|
|
||||||
|
|
||||||
let mut dir_msg = Message::directory("");
|
|
||||||
dir_msg.metadata.insert("thread_id".to_string(), serde_json::json!(thread_id.parse::<i64>().unwrap_or(0)));
|
|
||||||
messages.push(dir_msg);
|
|
||||||
|
|
||||||
log::info!("Found XenForo thread {}", thread_id);
|
|
||||||
Ok(messages)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for XenforoThreadExtractor {
|
// ============================================================================
|
||||||
fn default() -> Self {
|
// XenforoForumExtractor
|
||||||
Self::new().expect("Failed to create XenforoThreadExtractor")
|
// ============================================================================
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Clone for XenforoThreadExtractor {
|
pub struct XenforoForumExtractor {
|
||||||
fn clone(&self) -> Self {
|
pattern: Regex,
|
||||||
Self {
|
category: String,
|
||||||
pattern: self.pattern.clone(),
|
subcategory: String,
|
||||||
category: self.category.clone(),
|
root_url: String,
|
||||||
subcategory: self.subcategory.clone(),
|
domain: Option<String>,
|
||||||
root_url: self.root_url.clone(),
|
forum_path: Option<String>,
|
||||||
thread_id: self.thread_id.clone(),
|
cookies: HashMap<String, String>,
|
||||||
page: self.page.clone(),
|
|
||||||
client: HttpClient::builder()
|
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
||||||
.build()
|
|
||||||
.expect("Failed to create HTTP client"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl Extractor for XenforoThreadExtractor {
|
|
||||||
fn category(&self) -> &str { &self.category }
|
|
||||||
fn subcategory(&self) -> &str { &self.subcategory }
|
|
||||||
fn root(&self) -> &str { &self.root_url }
|
|
||||||
fn pattern(&self) -> &Regex { &self.pattern }
|
|
||||||
fn clone_extractor(&self) -> Box<dyn Extractor> { Box::new(self.clone()) }
|
|
||||||
|
|
||||||
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
|
||||||
if let Some(captures) = self.pattern.captures(&m.url) {
|
|
||||||
if let Some(id) = captures.get(2) {
|
|
||||||
self.thread_id = Some(id.as_str().to_string());
|
|
||||||
}
|
|
||||||
if let Some(page) = captures.get(3) {
|
|
||||||
self.page = Some(page.as_str().parse::<i64>().unwrap_or(1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
|
||||||
self.extract_thread().await
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl XenforoForumExtractor {
|
impl XenforoForumExtractor {
|
||||||
@@ -213,11 +626,9 @@ impl XenforoForumExtractor {
|
|||||||
category: "xenforo".to_string(),
|
category: "xenforo".to_string(),
|
||||||
subcategory: "forum".to_string(),
|
subcategory: "forum".to_string(),
|
||||||
root_url: "https://simpcity.cr".to_string(),
|
root_url: "https://simpcity.cr".to_string(),
|
||||||
forum_id: None,
|
domain: None,
|
||||||
client: HttpClient::builder()
|
forum_path: None,
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
cookies: HashMap::new(),
|
||||||
.build()
|
|
||||||
.map_err(|e| ExtractorError::ConfigError(e.to_string()))?,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -235,11 +646,9 @@ impl Clone for XenforoForumExtractor {
|
|||||||
category: self.category.clone(),
|
category: self.category.clone(),
|
||||||
subcategory: self.subcategory.clone(),
|
subcategory: self.subcategory.clone(),
|
||||||
root_url: self.root_url.clone(),
|
root_url: self.root_url.clone(),
|
||||||
forum_id: self.forum_id.clone(),
|
domain: self.domain.clone(),
|
||||||
client: HttpClient::builder()
|
forum_path: self.forum_path.clone(),
|
||||||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
cookies: self.cookies.clone(),
|
||||||
.build()
|
|
||||||
.expect("Failed to create HTTP client"),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -254,17 +663,25 @@ impl Extractor for XenforoForumExtractor {
|
|||||||
|
|
||||||
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
async fn initialize(&mut self, m: ExtractorMatch) -> Result<(), ExtractorError> {
|
||||||
if let Some(captures) = self.pattern.captures(&m.url) {
|
if let Some(captures) = self.pattern.captures(&m.url) {
|
||||||
if let Some(id) = captures.get(1) {
|
if let Some(domain) = captures.get(1) {
|
||||||
self.forum_id = Some(id.as_str().to_string());
|
self.domain = Some(domain.as_str().to_string());
|
||||||
|
self.root_url = root_for_domain(domain.as_str());
|
||||||
|
}
|
||||||
|
if let Some(path) = captures.get(2) {
|
||||||
|
self.forum_path = Some(path.as_str().to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
async fn items(&mut self) -> Result<Vec<Message>, ExtractorError> {
|
||||||
log::info!("Extracting XenForo forum");
|
log::info!("XenForo forum extractor not yet implemented");
|
||||||
Ok(vec![])
|
Ok(vec![])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn set_cookies(&mut self, cookies: HashMap<String, String>) {
|
||||||
|
self.cookies = cookies;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -282,4 +699,116 @@ mod tests {
|
|||||||
let extractor = XenforoThreadExtractor::new().unwrap();
|
let extractor = XenforoThreadExtractor::new().unwrap();
|
||||||
assert!(extractor.pattern.is_match("https://simpcity.cr/threads/TITLE.12345/"));
|
assert!(extractor.pattern.is_match("https://simpcity.cr/threads/TITLE.12345/"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_thread_pattern_with_page() {
|
||||||
|
let extractor = XenforoThreadExtractor::new().unwrap();
|
||||||
|
let url = "https://simpcity.cr/threads/dimeestevez.39618/page-2";
|
||||||
|
assert!(extractor.pattern.is_match(url));
|
||||||
|
|
||||||
|
let caps = extractor.pattern.captures(url).unwrap();
|
||||||
|
assert_eq!(caps.get(1).unwrap().as_str(), "simpcity.cr");
|
||||||
|
assert_eq!(caps.get(2).unwrap().as_str(), "/threads/dimeestevez.39618");
|
||||||
|
assert_eq!(caps.get(3).unwrap().as_str(), "39618");
|
||||||
|
assert_eq!(caps.get(4).unwrap().as_str(), "2");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_media_from_html() {
|
||||||
|
let html = r#"
|
||||||
|
<img src="https://example.com/image1.jpg" class="bbImage " loading="lazy" />
|
||||||
|
<video src="https://example.com/video.mp4"></video>
|
||||||
|
<a href="https://example.com/attachments/file.zip">Download</a>
|
||||||
|
"#;
|
||||||
|
let urls = extract_media_from_html(html, "https://simpcity.cr");
|
||||||
|
assert_eq!(urls.len(), 3);
|
||||||
|
assert!(urls.contains(&"https://example.com/image1.jpg".to_string()));
|
||||||
|
assert!(urls.contains(&"https://example.com/video.mp4".to_string()));
|
||||||
|
assert!(urls.contains(&"https://example.com/attachments/file.zip".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_media_skips_smilies() {
|
||||||
|
let html = r#"
|
||||||
|
<img src="https://simpcity.cr/styles/emoji.png" class="bbImage" />
|
||||||
|
<img src="https://example.com/real-image.jpg" class="bbImage " loading="lazy" />
|
||||||
|
"#;
|
||||||
|
let urls = extract_media_from_html(html, "https://simpcity.cr");
|
||||||
|
assert_eq!(urls.len(), 1);
|
||||||
|
assert_eq!(urls[0], "https://example.com/real-image.jpg");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_upgrade_thumbnail() {
|
||||||
|
assert_eq!(
|
||||||
|
upgrade_thumbnail("https://simp1.selti-delivery.ru/images/test.md.jpg"),
|
||||||
|
"https://simp1.selti-delivery.ru/images/test.jpg"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
upgrade_thumbnail("https://example.com/image.jpg"),
|
||||||
|
"https://example.com/image.jpg"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_posts_from_real_html() {
|
||||||
|
let html = r#"
|
||||||
|
<article class="message" data-content="post-111" id="js-post-111">
|
||||||
|
<article class="message-body js-selectToQuote">
|
||||||
|
<img src="https://cdn.example.com/img1.jpg" class="bbImage " />
|
||||||
|
</article>
|
||||||
|
</article>
|
||||||
|
<article class="message" data-content="post-222" id="js-post-222">
|
||||||
|
<article class="message-body js-selectToQuote">
|
||||||
|
<img src="https://cdn.example.com/img2.jpg" class="bbImage " />
|
||||||
|
</article>
|
||||||
|
</article>
|
||||||
|
"#;
|
||||||
|
let posts = extract_posts(html);
|
||||||
|
assert_eq!(posts.len(), 2);
|
||||||
|
assert_eq!(posts[0].0, "111");
|
||||||
|
assert_eq!(posts[1].0, "222");
|
||||||
|
|
||||||
|
// Each post should yield its own image
|
||||||
|
let urls1 = extract_media_from_html(&posts[0].1, "https://simpcity.cr");
|
||||||
|
assert_eq!(urls1.len(), 1);
|
||||||
|
assert!(urls1[0].contains("img1.jpg"));
|
||||||
|
|
||||||
|
let urls2 = extract_media_from_html(&posts[1].1, "https://simpcity.cr");
|
||||||
|
assert_eq!(urls2.len(), 1);
|
||||||
|
assert!(urls2[0].contains("img2.jpg"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_find_next_page() {
|
||||||
|
let html = r#"<a href="/threads/test.123/page-2" class="pageNav-jump pageNav-jump--next">Next</a>"#;
|
||||||
|
assert_eq!(find_next_page(html), Some("/threads/test.123/page-2".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_find_next_page_none() {
|
||||||
|
let html = r#"<div>no pagination here</div>"#;
|
||||||
|
assert_eq!(find_next_page(html), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_thread_title() {
|
||||||
|
let html = r#"<h1 class="p-title-value">Thread Title Here</h1>"#;
|
||||||
|
assert_eq!(extract_thread_title(html), Some("Thread Title Here".to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_url_filename() {
|
||||||
|
assert_eq!(
|
||||||
|
url_filename("https://example.com/path/to/image.jpg"),
|
||||||
|
Some("image.jpg".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(url_filename("https://example.com/"), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_root_for_domain() {
|
||||||
|
assert_eq!(root_for_domain("simpcity.cr"), "https://simpcity.cr");
|
||||||
|
assert_eq!(root_for_domain("nudostar.com/forum"), "https://nudostar.com/forum");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
94
src/main.rs
94
src/main.rs
@@ -86,6 +86,17 @@ fn write_page_dump(url: &str, items: &[Message]) {
|
|||||||
let _ = std::fs::write(path, out);
|
let _ = std::fs::write(path, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract a usable filename from a URL path
|
||||||
|
fn url_to_filename(url: &str) -> Option<String> {
|
||||||
|
let parsed = url::Url::parse(url).ok()?;
|
||||||
|
let path = parsed.path();
|
||||||
|
let segment = path.rsplit('/').next()?;
|
||||||
|
if segment.is_empty() || !segment.contains('.') {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
urlencoding::decode(segment).ok().map(|s| s.into_owned())
|
||||||
|
}
|
||||||
|
|
||||||
fn render_filename(pattern: Option<&str>, index: usize, item: &Message) -> String {
|
fn render_filename(pattern: Option<&str>, index: usize, item: &Message) -> String {
|
||||||
if let Some(template) = pattern {
|
if let Some(template) = pattern {
|
||||||
let ext = item.extension().unwrap_or_else(|| "bin".to_string());
|
let ext = item.extension().unwrap_or_else(|| "bin".to_string());
|
||||||
@@ -1092,9 +1103,14 @@ fn main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if let Some(ref browser) = args.cookies_from_browser {
|
} else if let Some(ref browser) = args.cookies_from_browser {
|
||||||
match gallery_dl::extract_browser_cookies(browser, None) {
|
// Extract the domain from input URLs to filter browser cookies
|
||||||
|
let domain_filter: Option<String> = args.urls.first()
|
||||||
|
.and_then(|u| url::Url::parse(u).ok())
|
||||||
|
.and_then(|u| u.host_str().map(|h| h.to_string()));
|
||||||
|
|
||||||
|
match gallery_dl::extract_browser_cookies(browser, domain_filter.as_deref()) {
|
||||||
Ok(c) => {
|
Ok(c) => {
|
||||||
log::info!("Extracted {} cookies from browser '{}'", c.len(), browser);
|
log::info!("Extracted {} cookies from browser '{}' (domain filter: {:?})", c.len(), browser, domain_filter);
|
||||||
Some(c)
|
Some(c)
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -1644,36 +1660,66 @@ fn main() {
|
|||||||
let mut metadata_by_url: HashMap<String, HashMap<String, serde_json::Value>> =
|
let mut metadata_by_url: HashMap<String, HashMap<String, serde_json::Value>> =
|
||||||
HashMap::new();
|
HashMap::new();
|
||||||
|
|
||||||
// Determine download directory: CLI arg > config > default
|
// Determine base download directory: CLI arg > config > default (Pictures/gallery-dl)
|
||||||
let download_dir = args.directory.clone()
|
let base_dir = args.directory.clone()
|
||||||
.or_else(|| args.destination.clone())
|
.or_else(|| args.destination.clone())
|
||||||
.or_else(|| config.downloader.directory.clone())
|
.or_else(|| config.downloader.directory.clone())
|
||||||
.unwrap_or_else(|| PathBuf::from("."));
|
.unwrap_or_else(|| {
|
||||||
|
dirs::picture_dir()
|
||||||
|
.unwrap_or_else(|| PathBuf::from("."))
|
||||||
|
.join("gallery-dl")
|
||||||
|
});
|
||||||
|
|
||||||
|
// Extract directory metadata from the first Directory message
|
||||||
|
// to build subdirectory path: {category}/{title}/
|
||||||
|
let mut dir_category = String::new();
|
||||||
|
let mut dir_title = String::new();
|
||||||
|
for item in items.iter() {
|
||||||
|
if matches!(item.kind, MessageKind::Directory) {
|
||||||
|
if let Some(cat) = item.metadata.get("category") {
|
||||||
|
dir_category = cat.as_str().unwrap_or("").to_string();
|
||||||
|
}
|
||||||
|
if let Some(title) = item.metadata.get("title") {
|
||||||
|
dir_title = title.as_str().unwrap_or("").to_string();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the download directory with subdirectories
|
||||||
|
let download_dir = if !dir_category.is_empty() || !dir_title.is_empty() {
|
||||||
|
let cat = if dir_category.is_empty() { "other".to_string() } else {
|
||||||
|
sanitize_filename(&dir_category, args.restrict_filenames, true)
|
||||||
|
};
|
||||||
|
let title = if dir_title.is_empty() { "untitled".to_string() } else {
|
||||||
|
sanitize_filename(&dir_title, args.restrict_filenames, true)
|
||||||
|
};
|
||||||
|
base_dir.join(cat).join(title)
|
||||||
|
} else {
|
||||||
|
base_dir.clone()
|
||||||
|
};
|
||||||
|
|
||||||
for (j, item) in items.iter().enumerate() {
|
for (j, item) in items.iter().enumerate() {
|
||||||
if !matches!(item.kind, MessageKind::Url | MessageKind::Queue) {
|
if !matches!(item.kind, MessageKind::Url | MessageKind::Queue) {
|
||||||
println!(" [{}] Skipping non-download message ({:?})", j + 1, item.kind);
|
println!(" [{}] Skipping non-download message ({:?})", j + 1, item.kind);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut template_pattern = args
|
// Use the extractor-provided filename, or derive from URL, or fall back to template
|
||||||
.rename_to
|
let filename = if let Some(ref f) = item.filename {
|
||||||
.as_deref()
|
f.clone()
|
||||||
.or(args.rename.as_deref())
|
} else if let Some(f) = url_to_filename(&item.url) {
|
||||||
.or(args.filename.as_deref())
|
f
|
||||||
.or(config.downloader.filename.as_deref());
|
} else {
|
||||||
|
let template_pattern = args
|
||||||
if template_pattern.is_none() {
|
.rename_to
|
||||||
template_pattern = Some("{num}.{ext}");
|
.as_deref()
|
||||||
}
|
.or(args.rename.as_deref())
|
||||||
|
.or(args.filename.as_deref())
|
||||||
// Create a simple destination path based on the URL
|
.or(config.downloader.filename.as_deref())
|
||||||
// In a full implementation, this would use path templates
|
.unwrap_or("{num}.{ext}");
|
||||||
let filename = render_filename(
|
render_filename(Some(template_pattern), j, item)
|
||||||
template_pattern,
|
};
|
||||||
j,
|
|
||||||
item,
|
|
||||||
);
|
|
||||||
let filename = sanitize_filename(
|
let filename = sanitize_filename(
|
||||||
&filename,
|
&filename,
|
||||||
args.restrict_filenames,
|
args.restrict_filenames,
|
||||||
|
|||||||
Reference in New Issue
Block a user