initial commit

This commit is contained in:
winneratwin 2022-06-08 02:11:59 +01:00
commit d646d23895
Signed by: winneratwin
GPG Key ID: CDBC42F8803D689E
5 changed files with 3852 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

1516
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

11
Cargo.toml Normal file
View File

@ -0,0 +1,11 @@
[package]
name = "european-youth-card-discount-scraper"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
reqwest = { version = "0.11", features = [] }
tokio = { version = "1", features = ["full"] }
scraper = "0.13.0"

2246
deals.txt Normal file

File diff suppressed because it is too large Load Diff

78
src/main.rs Normal file
View File

@ -0,0 +1,78 @@
use scraper::{Html, Selector};
#[tokio::main]
async fn main() {
let url_base = "https://www.eyca.org/discounts/gb/";
let response = reqwest::get(url_base.to_string() + &1.to_string()).await.unwrap();
let html = response.text().await.unwrap();
let document = Html::parse_document(&html);
println!("page: 1");
print_discounts(&document);
let selector = Selector::parse("div.paginate-pages > a").unwrap();
let paginate = document.select(&selector).map(|node| node.text().next().unwrap()).collect::<Vec<_>>();
let page_count = paginate[paginate.len() - 2].parse::<i32>().unwrap();
for page_number in 2..=page_count {
println!("page: {}", page_number);
let response = reqwest::get(url_base.to_string() + &page_number.to_string()).await.unwrap();
let html = response.text().await.unwrap();
let document = Html::parse_document(&html);
print_discounts(&document);
}
//print pretty html
//println!("{:#?}", document.root_element().html());
}
fn print_discounts(document: &Html) {
let domain_base = "https://www.eyca.org";
let selector = Selector::parse("article.p025 > div > a").unwrap();
for article in document.select(&selector) {
//get the href attribute
println!("link for deal: {}{}", domain_base,article.value().attr("href").unwrap());
//get company name
let selector = Selector::parse("div > h2").unwrap();
let company_name = article.select(&selector).next().unwrap().text().next().unwrap();
println!("company name: {}", company_name);
//get description
let selector = Selector::parse("div > p").unwrap();
let description = article.select(&selector).next().unwrap().text().next().unwrap();
println!("description: {}", description);
//get tags(if any)
let selector = Selector::parse("div > div > ul > li").unwrap();
let tags = article.select(&selector);
if tags.clone().count() > 0 {
print!("tags:");
for tag in tags {
print!(" \"{}\"", tag.text().next().unwrap());
}
println!();
} else {
println!("tags: None");
}
//get category and locations
let selector = Selector::parse("div.card-content.p1.sd-p2 > ul > li").unwrap();
let mut category_and_locations = article.select(&selector);
println!("category: {}", category_and_locations.next().unwrap().text().next().unwrap());
println!("locations: {}", category_and_locations.next().unwrap().text().next().unwrap().trim());
//print separator
println!("-----------------------------------------------------");
}
}