From efa8e3f091d196228215be82adfe1db20a5595ab Mon Sep 17 00:00:00 2001
From: jan
Date: Sun, 18 Sep 2016 16:50:23 +0200
Subject: initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..eb5a316
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+target
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..534b5ca
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "acd_crawl"
+version = "0.1.0"
+authors = ["jan "]
+
+[dependencies]
+hyper = "0.9"
+rand = "0.3"
+log = "0.3"
+env_logger = "0.3"
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..69a9c74
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,102 @@
+extern crate hyper;
+use hyper::client::Client;
+use hyper::header::UserAgent;
+use hyper::status::StatusCode;
+
+#[macro_use] extern crate log;
+extern crate env_logger;
+
+extern crate rand;
+use rand::{ thread_rng, Rng};
+
+use std::fs;
+use std::fs::File;
+use std::sync::Arc;
+use std::ops::Deref;
+use std::io::prelude::*;
+use std::env;
+use std::thread;
+use std::time::Duration;
+use std::str::FromStr;
+
+use std::path::Path;
+
+/// Exclusive upper bound for the character ids drawn by `get_next`.
+static MAX_CHARS: u32 = 77784;
+/// Character page URL; `download` appends the numeric id to it.
+static BASE_URL: &'static str = "http://www.animecharactersdatabase.com/character.php?id=";
+
+/// Picks a random character id that has no `<id>.html` file in `base_path` yet.
+///
+/// Ids are drawn from `1..MAX_CHARS` until an unused one turns up, so this
+/// never returns once every id in that range has been saved.
+fn get_next(base_path: &str) -> u32 {
+    let base = Path::new(base_path);
+    let mut rng = thread_rng();
+
+    loop {
+        // NOTE(review): `gen_range` excludes its upper bound, so id MAX_CHARS
+        // itself is never requested -- confirm that this is intended.
+        let num: u32 = rng.gen_range(1, MAX_CHARS);
+        if !base.join(format!("{}.html", num)).exists() {
+            return num;
+        }
+    }
+}
+
+/// Fetches the page of character `id` and saves it as `<base_path>/<id>.html`.
+///
+/// # Panics
+///
+/// Panics when the request cannot be sent, the status is not 200 OK, the body
+/// cannot be read into a string or contains "IP ban", or the file cannot be
+/// created or written.
+fn download(client: &Client, base_path: &str, id: u32) {
+    debug!("downloading character {}", id);
+    let mut res = client.get(&format!("{}{}", BASE_URL, id))
+        .header(UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14915".into()))
+        .send()
+        .expect("could not send request");
+
+    if res.status != StatusCode::Ok {
+        panic!("invalid status code {}", res.status);
+    }
+    debug!("got response");
+
+    let mut buf = String::new();
+    res.read_to_string(&mut buf).expect("could not read response");
+
+    // Original author's note: unsure whether this ban marker still works.
+    if buf.contains("IP ban") {
+        panic!("WE ARE BANNED");
+    }
+
+    let mut f = File::create(Path::new(base_path).join(format!("{}.html", id))).expect("could not create file");
+
+    f.write_all(buf.as_bytes()).expect("could not write response to file");
+}
+
+/// Crawls forever: saves one random, not yet stored character page into the
+/// directory named by `OUT`, then sleeps `TIMEOUT` seconds.
+///
+/// # Panics
+///
+/// Panics when `TIMEOUT` or `OUT` is missing, `TIMEOUT` is not a valid `u64`,
+/// the logger cannot be initialised, or the output directory cannot be created.
+fn main() {
+    let timeout = u64::from_str(&(env::var("TIMEOUT").expect("TIMEOUT env var not set"))).expect("invalid TIMEOUT env var");
+    let base_path = env::var("OUT").expect("OUT not set");
+    env_logger::init().unwrap();
+    // Pages are written into `base_path`, so that is the directory that must
+    // exist (a fixed "html" directory is of no use when OUT points elsewhere).
+    fs::create_dir_all(&base_path).expect("could not create output directory");
+
+    let client = Arc::new(Client::new());
+
+    loop {
+        info!("getting next character");
+        // Borrow the shared client; no extra `Arc` clone is needed per request.
+        download(client.deref(), &base_path, get_next(&base_path));
+        thread::sleep(Duration::from_secs(timeout));
+    }
+}
-- 
cgit v0.10.1