From 1400381e9c193eb7f681d95616eed9070edaab56 Mon Sep 17 00:00:00 2001
From: rtz12
Date: Tue, 29 Mar 2016 17:58:55 +0200
Subject: Crawler hinzugefuegt

diff --git a/tools/charcrawler/.gitignore b/tools/charcrawler/.gitignore
new file mode 100644
index 0000000..02a07a1
--- /dev/null
+++ b/tools/charcrawler/.gitignore
@@ -0,0 +1,2 @@
+data/
+*.txt
diff --git a/tools/charcrawler/crawler/acd.go b/tools/charcrawler/crawler/acd.go
new file mode 100644
index 0000000..31efb0d
--- /dev/null
+++ b/tools/charcrawler/crawler/acd.go
@@ -0,0 +1,113 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ACDCrawler scrapes character pages from animecharactersdatabase.com.
+type ACDCrawler struct{}
+
+// Name returns the identifier of this crawler.
+func (a ACDCrawler) Name() string {
+	return "ACD"
+}
+
+// Crawl downloads the character page for id and collects its attributes.
+//
+// It returns CrawlError when the page cannot be fetched and
+// CharacterNotFound when the page text contains the site's
+// "bad character" notice.
+func (a ACDCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/character.php?id=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	text := doc.Text()
+	if strings.Contains(text, "bad character : try") {
+		return nil, CharacterNotFound
+	}
+
+	// The element following #besttable has two cells per row: attributes are
+	// read from the first, images and appearance links from the second.
+	dataTr := doc.Find("#besttable").Next().ChildrenFiltered("tbody").ChildrenFiltered("tr")
+	leftSide := dataTr.Children().Eq(0)
+	rightSide := dataTr.Children().Eq(1)
+
+	imageCols := rightSide.ChildrenFiltered("table").First().Find("tbody > tr > td")
+	val, _ := imageCols.Eq(0).Find("img").Attr("src")
+	c["__thumb"] = val
+	val, _ = imageCols.Eq(1).Find("a").Attr("href")
+	c["__img"] = val
+
+	// Rows of the second table: label in <th>, value in <td>.
+	leftSide.ChildrenFiltered("table").Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) {
+		c[s.Find("th").Text()] = s.Find("td").Text()
+	})
+
+	// Definition lists alternate <dt> labels and <dd> values; key remembers
+	// the most recent label and is shared by both lists below.
+	var key string
+	leftSide.
+		ChildrenFiltered("table").
+		Eq(0).
+		Find("td").
+		Eq(1).
+		Find("dl").
+		Children().
+		Each(func(i int, s *goquery.Selection) {
+			switch goquery.NodeName(s) {
+			case "dt":
+				key = s.Text()
+			case "dd":
+				c[key] = s.Text()
+			}
+		})
+
+	// Non-nil empty slices encode as [] rather than null in JSON.
+	tags := make([]string, 0)
+	leftSide.ChildrenFiltered("div").Eq(0).Find("a").Each(func(i int, s *goquery.Selection) {
+		tags = append(tags, s.Text())
+	})
+	c["__tags"] = tags
+	vas := make([]string, 0)
+	leftSide.ChildrenFiltered("div").Eq(1).Find("a").Each(func(i int, s *goquery.Selection) {
+		vas = append(vas, s.Text())
+	})
+	c["__vas"] = vas
+
+	leftSide.ChildrenFiltered("dl").Children().Each(func(i int, s *goquery.Selection) {
+		switch goquery.NodeName(s) {
+		case "dt":
+			key = s.Text()
+		case "dd":
+			c[key] = s.Text()
+		}
+	})
+
+	apps := make([]int, 0)
+	rightSide.Find(".tile3top").Each(func(i int, s *goquery.Selection) {
+		href, ok := s.Find("a").Attr("href")
+		if !ok {
+			return
+		}
+		// The numeric id is expected after the "=" of the link; links
+		// without one are skipped instead of indexing out of range.
+		parts := strings.SplitN(href, "=", 2)
+		if len(parts) != 2 {
+			return
+		}
+		appID, err := strconv.Atoi(parts[1])
+		if err != nil {
+			return
+		}
+		apps = append(apps, appID)
+	})
+	c["__appearances"] = apps
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/acd_anime.go b/tools/charcrawler/crawler/acd_anime.go
new file mode 100644
index 0000000..106655d
--- /dev/null
+++ b/tools/charcrawler/crawler/acd_anime.go
@@ -0,0 +1,48 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ACDAnimeCrawler scrapes series pages from animecharactersdatabase.com.
+type ACDAnimeCrawler struct{}
+
+// Name returns the identifier of this crawler.
+func (a ACDAnimeCrawler) Name() string {
+	return "ACDAnime"
+}
+
+// Crawl downloads the series page for id and collects its attributes.
+//
+// It returns CrawlError when the page cannot be fetched and
+// CharacterNotFound when the page text contains the site's "bad series"
+// notice.
+func (a ACDAnimeCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/series.php?id=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	text := doc.Text()
+	if strings.Contains(text, "bad series : try") {
+		return nil, CharacterNotFound
+	}
+	tds := doc.Find("#besttable > table > tbody > tr > td")
+	val, _ := tds.Eq(0).Find("img").Attr("src")
+	c["__thumb"] = val
+	tds.Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) {
+		key := s.Find("th").Text()
+		value := s.Find("td").Text()
+		// For the home page row store the link target, not its label.
+		if key == "Home Page" {
+			value, _ = s.Find("td > a").Attr("href")
+		}
+		c[key] = value
+	})
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/anidb.go b/tools/charcrawler/crawler/anidb.go
new file mode 100644
index 0000000..12bf05b
--- /dev/null
+++ b/tools/charcrawler/crawler/anidb.go
@@ -0,0 +1,74 @@
+package crawler
+
+import (
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// AniDBCrawler scrapes character pages from anidb.net.
+type AniDBCrawler struct{}
+
+// Name returns the identifier of this crawler.
+func (a AniDBCrawler) Name() string {
+	return "AniDB"
+}
+
+// Crawl downloads the character page for id and collects its attributes.
+//
+// It returns CharacterNotFound when the site's error box reports an
+// unknown character id, and CrawlError when the page cannot be fetched
+// or the error box contains "BANNED".
+func (a AniDBCrawler) Crawl(id int) (CharacterData, error) {
+	c := make(CharacterData)
+	doc, err := goquery.NewDocument("http://anidb.net/perl-bin/animedb.pl?show=character&charid=" + strconv.Itoa(id))
+	if err != nil {
+		log.Println(err)
+		return nil, CrawlError
+	}
+	siteErr, _ := doc.Find(".error p").Html()
+	if strings.Contains(siteErr, "Unknown character id") {
+		return nil, CharacterNotFound
+	}
+	if strings.Contains(siteErr, "BANNED") {
+		return nil, CrawlError
+	}
+
+	links := make([]string, 0)
+	doc.Find(".characters").Each(func(i int, s *goquery.Selection) {
+		// Only the first .characters element is parsed in full; from every
+		// later one just the link of its main name is kept.
+		if i > 0 {
+			link, _ := s.Find(".mainname a").Attr("href")
+			links = append(links, strings.Replace(link, "http://anidb.net/ch", "", 1))
+			return
+		}
+		s.Find(".g_definitionlist tr").Each(func(i int, s *goquery.Selection) {
+			keyHTML, _ := s.Find("th").Html()
+			valueHTML, _ := s.Find("td").Html()
+			c[strings.TrimSpace(keyHTML)] = strings.TrimSpace(valueHTML)
+		})
+		desc, _ := s.Find(".desc").Html()
+		c["__desc"] = strings.TrimSpace(desc)
+		imgURL, _ := s.Find(".image img").Attr("src")
+		c["__img"] = imgURL
+		animes := make([]map[string]string, 0)
+		s.Find(".animelist tr").Each(func(i int, s *goquery.Selection) {
+			if s.HasClass("header") {
+				return
+			}
+			relHTML, _ := s.Find(".relation").Html()
+			nameHTML, _ := s.Find(".name a").Html()
+			animes = append(animes, map[string]string{
+				"rel":  strings.TrimSpace(relHTML),
+				"name": strings.TrimSpace(nameHTML),
+			})
+		})
+		c["__animes"] = animes
+	})
+	c["__links"] = links
+
+	return c, nil
+}
diff --git a/tools/charcrawler/crawler/crawler.go b/tools/charcrawler/crawler/crawler.go
new file mode 100644
index 0000000..337bc76
--- /dev/null
+++ b/tools/charcrawler/crawler/crawler.go
@@ -0,0 +1,111 @@
+// Package crawler scrapes records from anime database websites and stores
+// each one as a JSON file.
+package crawler
+
+import (
+	"encoding/json"
+	"errors"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// CharacterData holds the scraped attributes of one record keyed by
+// attribute name.
+type CharacterData map[string]interface{}
+
+// Crawler scrapes records of one kind, addressed by numeric id.
+type Crawler interface {
+	// Name returns the identifier Start uses for the data directory, the
+	// progress file and log messages.
+	Name() string
+	// Crawl fetches the record with the given id. Start treats
+	// CharacterNotFound as a skippable miss and any other error as fatal.
+	Crawl(id int) (CharacterData, error)
+}
+
+// Instances lists the crawlers registered by this package's init.
+var Instances []Crawler
+
+// Errors returned by Crawler implementations.
+var (
+	CrawlError        = errors.New("Error while crawling")
+	CharacterNotFound = errors.New("Character not found")
+)
+
+// maxFaults is the number of not-found ids after which Start gives up.
+const maxFaults = 100
+
+// Start crawls ids in ascending order, one per second, with c.
+//
+// Each record is written to data/<name>/<id>.json and the next id to crawl
+// is saved in <name>.txt, from which a later run resumes. Start returns
+// once maxFaults ids in total were reported as not found and panics when
+// Crawl fails with any other error; the progress file is not advanced in
+// that case, so the failed id is retried on the next run.
+func Start(c Crawler) {
+	name := c.Name()
+	dataDir := filepath.Join("data", name)
+	progressFile := name + ".txt"
+	if err := os.MkdirAll(dataDir, 0755); err != nil {
+		log.Printf("[%s] Creating %s failed: %v", name, dataDir, err)
+	}
+	log.Printf("Starting Crawler %s...", name)
+
+	current := 1
+	if save, err := ioutil.ReadFile(progressFile); err == nil {
+		if i, err := strconv.Atoi(strings.TrimSpace(string(save))); err == nil {
+			current = i
+		}
+	}
+
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+	faultCounter := 0
+	for range ticker.C {
+		if faultCounter >= maxFaults {
+			log.Printf("[%s] Exiting after %d fails", name, maxFaults)
+			return
+		}
+		log.Printf("[%s] Crawling %d", name, current)
+		char, err := c.Crawl(current)
+		switch err {
+		case nil:
+			if err := writeRecord(dataDir, current, char); err != nil {
+				log.Printf("[%s] Saving %d failed: %v", name, current, err)
+			}
+		case CharacterNotFound:
+			log.Printf("[%s] Char %d not found!", name, current)
+			faultCounter++
+		default:
+			// NOTE(review): a failed fetch takes the whole process down,
+			// including the other crawlers - confirm this is intended.
+			panic(err)
+		}
+
+		current++
+		if err := ioutil.WriteFile(progressFile, []byte(strconv.Itoa(current)), 0644); err != nil {
+			log.Printf("[%s] Saving progress failed: %v", name, err)
+		}
+	}
+}
+
+// writeRecord stores char as JSON in dir/<id>.json.
+func writeRecord(dir string, id int, char CharacterData) error {
+	data, err := json.Marshal(char)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(filepath.Join(dir, strconv.Itoa(id)+".json"), data, 0644)
+}
+
+// init registers the crawlers exposed through Instances.
+// NOTE(review): AniDBCrawler is not registered here - confirm whether
+// that is deliberate.
+func init() {
+	Instances = append(Instances, new(ACDCrawler), new(ACDAnimeCrawler))
+}
diff --git a/tools/charcrawler/main b/tools/charcrawler/main
new file mode 100755
index 0000000..71f9896
Binary files /dev/null and b/tools/charcrawler/main differ
diff --git a/tools/charcrawler/main.go b/tools/charcrawler/main.go
new file mode 100644
index 0000000..0c94151
--- /dev/null
+++ b/tools/charcrawler/main.go
@@ -0,0 +1,22 @@
+// Command charcrawler runs all registered crawlers concurrently.
+package main
+
+import (
+	"sync"
+
+	"private/charcrawler/crawler"
+)
+
+// main starts every crawler in its own goroutine and exits once all of
+// them have returned, instead of sleeping forever after they are done.
+func main() {
+	var wg sync.WaitGroup
+	for _, c := range crawler.Instances {
+		wg.Add(1)
+		go func(c crawler.Crawler) {
+			defer wg.Done()
+			crawler.Start(c)
+		}(c)
+	}
+	wg.Wait()
+}
-- 
cgit v0.10.1