feat: fetch video durations in parallel batches and store them in the database

This commit is contained in:
2025-04-13 16:43:35 +02:00
parent 8d72c227cd
commit 4f1e6cea38

View File

@@ -1,3 +1,4 @@
use indicatif::{ProgressBar, ProgressStyle};
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use regex::Regex; use regex::Regex;
use std::fs::File; use std::fs::File;
@@ -5,9 +6,10 @@ use std::io::Read;
use std::process::Command; use std::process::Command;
use rusqlite::{Connection, Result}; use rusqlite::{Connection, Result};
use std::time::Instant; use std::time::Instant;
use rayon::prelude::*;
fn main() -> Result<(), Box<dyn std::error::Error>> { fn main() -> Result<(), Box<dyn std::error::Error>> {
let db = Connection::open("video_ids.db")?; let mut db = Connection::open("video_ids.db")?;
db.execute( db.execute(
"CREATE TABLE IF NOT EXISTS video_ids ( "CREATE TABLE IF NOT EXISTS video_ids (
@@ -17,12 +19,60 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
[], [],
)?; )?;
get_ids_history(db)?; get_ids_history(&mut db)?;
let mut stmt = db.prepare("SELECT id FROM video_ids WHERE duration IS NULL OR duration = ''")?;
let ids_to_fetch: Vec<String> = stmt.query_map([], |row| row.get(0))?
.collect::<Result<Vec<String>, _>>()?;
stmt.finalize()?;
let total_ids = ids_to_fetch.len();
if total_ids == 0 {
println!("No IDs to fetch.");
return Ok(());
}
let progress_bar = ProgressBar::new(total_ids as u64);
progress_bar.set_style(ProgressStyle::default_bar()
.template("{msg} [{elapsed_precise}] {wide_bar} {pos}/{len} ({eta})")?
.progress_chars("##-"));
progress_bar.set_message("Fetching video durations...");
let start_time = Instant::now();
let batch_size = 30;
let results: Vec<(String, String)> = ids_to_fetch
.par_chunks(batch_size)
.flat_map(|chunk| {
chunk.iter().map(|id| {
let duration = get_video_duration(id).unwrap_or_else(|_| "Error".to_string());
progress_bar.inc(1);
(id.clone(), duration)
}).collect::<Vec<_>>()
})
.collect();
let elapsed_time = start_time.elapsed();
println!("Fetching video durations took: {:?}", elapsed_time);
let update_start = Instant::now();
let tx = db.transaction()?;
{
let mut stmt = tx.prepare_cached("UPDATE video_ids SET duration = ? WHERE id = ?")?;
for (id, duration) in results {
stmt.execute(rusqlite::params![duration, id])?;
progress_bar.inc(1);
}
}
tx.commit()?;
progress_bar.finish_with_message("Done!");
let update_duration = update_start.elapsed();
println!("Updating database took: {:?}", update_duration);
println!("Total time taken: {:?}", elapsed_time + update_duration);
Ok(()) Ok(())
} }
fn get_ids_history(mut db: Connection) -> Result<(), Box<dyn std::error::Error>> { fn get_ids_history(db: &mut Connection) -> Result<(), Box<dyn std::error::Error>> {
let read_start = Instant::now(); let read_start = Instant::now();
let mut id_db: Vec<&str> = Vec::new(); let mut id_db: Vec<&str> = Vec::new();
@@ -55,7 +105,7 @@ fn get_ids_history(mut db: Connection) -> Result<(), Box<dyn std::error::Error>>
// database inserting // database inserting
let insert_start = Instant::now(); let insert_start = Instant::now();
let tx = db.transaction()?; let tx = db.transaction()?;
{ {
let mut stmt = tx.prepare_cached("INSERT OR IGNORE INTO video_ids (id) VALUES (?)")?; let mut stmt = tx.prepare_cached("INSERT OR IGNORE INTO video_ids (id) VALUES (?)")?;
@@ -75,20 +125,32 @@ fn get_ids_history(mut db: Connection) -> Result<(), Box<dyn std::error::Error>>
/// Fetches the duration of a YouTube video by shelling out to `yt-dlp`.
///
/// Runs `yt-dlp --get-duration` against the watch URL built from `id` and
/// returns the trimmed standard output of the first attempt that exits
/// successfully with non-empty output. Up to three attempts are made, with a
/// linearly growing pause (500 ms, then 1000 ms) between them.
///
/// # Errors
///
/// * Returns immediately, without retrying, if the `yt-dlp` process cannot be
///   spawned or its output cannot be collected.
/// * Returns an error naming the video ID and the reason for the last failed
///   attempt (exit status plus stderr, or empty output) once every attempt
///   has failed.
fn get_video_duration(id: &str) -> Result<String, Box<dyn std::error::Error>> {
    const MAX_ATTEMPTS: u64 = 3;
    const BACKOFF_STEP_MS: u64 = 500;

    let url = format!("https://www.youtube.com/watch?v={}", id);
    // Reason for the most recent failure, so the final error is actionable.
    let mut last_failure = String::from("no attempt was made");

    for attempt in 1..=MAX_ATTEMPTS {
        let output = Command::new("yt-dlp")
            .args([
                "--get-duration",
                "--no-warnings",
                "--socket-timeout",
                "10",
                url.as_str(),
            ])
            .output()?;

        if output.status.success() {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let duration = stdout.trim();
            if !duration.is_empty() {
                return Ok(duration.to_string());
            }
            // A zero exit status with no output is still treated as a failure.
            last_failure = String::from("yt-dlp printed no duration");
        } else {
            let stderr = String::from_utf8_lossy(&output.stderr);
            last_failure = format!("{}: {}", output.status, stderr.trim());
        }

        // Back off only when another attempt follows; sleeping after the
        // final failure would just delay the error.
        if attempt < MAX_ATTEMPTS {
            std::thread::sleep(std::time::Duration::from_millis(BACKOFF_STEP_MS * attempt));
        }
    }

    Err(format!(
        "Failed to get duration for video ID: {} ({})",
        id, last_failure
    )
    .into())
}