diff options
-rw-r--r-- | modules/grils/grils.go | 65 | ||||
-rw-r--r-- | tools/charcrawler/.gitignore | 3 | ||||
-rw-r--r-- | tools/charcrawler/crawler/acd.go | 84 | ||||
-rw-r--r-- | tools/charcrawler/crawler/acd_anime.go | 44 | ||||
-rw-r--r-- | tools/charcrawler/crawler/anidb.go | 69 | ||||
-rw-r--r-- | tools/charcrawler/crawler/crawler.go | 70 | ||||
-rw-r--r-- | tools/charcrawler/main.go | 16 |
7 files changed, 346 insertions, 5 deletions
diff --git a/modules/grils/grils.go b/modules/grils/grils.go index 791ea5e..116b689 100644 --- a/modules/grils/grils.go +++ b/modules/grils/grils.go | |||
@@ -159,16 +159,71 @@ func (m *GrilsModule) FromID(id int) (*Gril, error) { | |||
159 | gril := &Gril{ | 159 | gril := &Gril{ |
160 | ID: id, | 160 | ID: id, |
161 | } | 161 | } |
162 | var tags []byte | 162 | t1 := time.Now() |
163 | var otherNames []byte | 163 | err := m.g.DB.QueryRow(`SELECT updated_at, age, birthday FROM grilist.grils WHERE id = $1`, id).Scan(&gril.UpdatedAt, &gril.Age, &gril.Birthday) |
164 | err := m.g.DB.QueryRow(`SELECT kanji_name, romaji_name, other_names, updated_at, age, birthday, tags FROM grilist.grils_flattened WHERE id = $1`, id).Scan(&gril.KanjiName, &gril.RomajiName, &otherNames, &gril.UpdatedAt, &gril.Age, &gril.Birthday, &tags) | 164 | log.Printf("get_gril_from_id_raw: %dms", time.Since(t1).Nanoseconds()/1000000) |
165 | 165 | ||
166 | gril.Tags = pgArray(tags) | ||
167 | gril.OtherNames = pgArray(otherNames) | ||
168 | if err != nil { | 166 | if err != nil { |
169 | return nil, err | 167 | return nil, err |
170 | } | 168 | } |
169 | |||
170 | // Namen rausholen | ||
171 | rows, err := m.g.DB.Query(`SELECT name, name_type FROM grilist.gril_names WHERE gril_id = $1`, id) | ||
172 | if err != nil { | ||
173 | return nil, err | ||
174 | } | ||
175 | log.Printf("get_gril_from_id_names: %dms", time.Since(t1).Nanoseconds()/1000000) | ||
176 | defer rows.Close() | ||
177 | |||
178 | for rows.Next() { | ||
179 | var name string | ||
180 | var name_type int | ||
181 | if err := rows.Scan(&name, &name_type); err != nil { | ||
182 | return nil, err | ||
183 | } | ||
184 | |||
185 | switch name_type { | ||
186 | case 0: | ||
187 | gril.KanjiName = name | ||
188 | break | ||
189 | case 1: | ||
190 | gril.RomajiName = name | ||
191 | default: | ||
192 | gril.OtherNames = append(gril.OtherNames, name) | ||
193 | } | ||
194 | } | ||
195 | /*rows, err := m.g.DB.Query(`SELECT name FROM grilist.gril_names WHERE gril_id = $1 AND name_type > 1`, id) | ||
196 | if err != nil { | ||
197 | return nil, err | ||
198 | } | ||
199 | defer rows.Close() | ||
200 | |||
201 | for rows.Next() { | ||
202 | var name string | ||
203 | if err := rows.Scan(&name); err != nil { | ||
204 | return nil, err | ||
205 | } | ||
206 | gril.OtherNames = append(gril.OtherNames, name) | ||
207 | } | ||
208 | rows.Close() | ||
209 | |||
210 | // Tags holen | ||
211 | rows, err := m.g.DB.Query(`SELECT name FROM grilist.tags WHERE id IN (SELECT tag_id FROM grilist.grils_tags WHERE gril_id = $1)`, id) | ||
212 | if err != nil { | ||
213 | return nil, err | ||
214 | } | ||
215 | defer rows.Close() | ||
216 | |||
217 | for rows.Next() { | ||
218 | var name string | ||
219 | if err := rows.Scan(&name); err != nil { | ||
220 | return nil, err | ||
221 | } | ||
222 | gril.Tags = append(gril.Tags, name) | ||
223 | }*/ | ||
224 | |||
171 | m.c.Insert(id, gril) | 225 | m.c.Insert(id, gril) |
226 | log.Printf("get_gril_from_id: %dms", time.Since(t1).Nanoseconds()/1000000) | ||
172 | return gril, nil | 227 | return gril, nil |
173 | } | 228 | } |
174 | 229 | ||
diff --git a/tools/charcrawler/.gitignore b/tools/charcrawler/.gitignore new file mode 100644 index 0000000..961f545 --- /dev/null +++ b/tools/charcrawler/.gitignore | |||
@@ -0,0 +1,3 @@ | |||
1 | data/ | ||
2 | *.txt | ||
3 | charcrawler | ||
diff --git a/tools/charcrawler/crawler/acd.go b/tools/charcrawler/crawler/acd.go new file mode 100644 index 0000000..14f22a2 --- /dev/null +++ b/tools/charcrawler/crawler/acd.go | |||
@@ -0,0 +1,84 @@ | |||
1 | package crawler | ||
2 | |||
3 | import ( | ||
4 | "log" | ||
5 | "strconv" | ||
6 | "strings" | ||
7 | |||
8 | "github.com/PuerkitoBio/goquery" | ||
9 | ) | ||
10 | |||
// ACDCrawler is a stateless crawler for character pages on
// animecharactersdatabase.com. The zero value is ready to use.
type ACDCrawler struct{}

// Name returns the identifier of this crawler, "ACD".
func (a ACDCrawler) Name() string {
	const name = "ACD"
	return name
}
16 | |||
17 | func (a ACDCrawler) Crawl(id int) (CharacterData, error) { | ||
18 | c := make(CharacterData) | ||
19 | doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/character.php?id=" + strconv.Itoa(id)) | ||
20 | if err != nil { | ||
21 | log.Println(err) | ||
22 | return nil, CrawlError | ||
23 | } | ||
24 | text := doc.Text() | ||
25 | if strings.Contains(text, "bad character : try") { | ||
26 | return nil, CharacterNotFound | ||
27 | } | ||
28 | if strings.Contains(text, "Your IP has been blocked") { | ||
29 | return nil, Banned | ||
30 | } | ||
31 | dataTr := doc.Find("#profile").ChildrenFiltered("table").Eq(0).ChildrenFiltered("tbody").ChildrenFiltered("tr") | ||
32 | leftSide := dataTr.Children().Eq(0) | ||
33 | rightSide := dataTr.Children().Eq(1) | ||
34 | imageCols := rightSide.ChildrenFiltered("table").First().Find("tbody > tr > td") | ||
35 | val, _ := imageCols.Eq(0).Find("img").Attr("src") | ||
36 | c["__thumb"] = val | ||
37 | val, _ = imageCols.Eq(1).Find("a").Attr("href") | ||
38 | c["__img"] = val | ||
39 | leftSide.ChildrenFiltered("table").Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) { | ||
40 | c[s.Find("th").Text()] = s.Find("td").Text() | ||
41 | }) | ||
42 | var key string | ||
43 | leftSide. | ||
44 | ChildrenFiltered("table"). | ||
45 | Eq(0). | ||
46 | Find("td"). | ||
47 | Eq(1). | ||
48 | Find("dl"). | ||
49 | Children(). | ||
50 | Each(func(i int, s *goquery.Selection) { | ||
51 | switch goquery.NodeName(s) { | ||
52 | case "dt": | ||
53 | key = s.Text() | ||
54 | case "dd": | ||
55 | c[key] = s.Text() | ||
56 | } | ||
57 | }) | ||
58 | tags := make([]string, 0) | ||
59 | leftSide.ChildrenFiltered("div").Eq(0).Find("a").Each(func(i int, s *goquery.Selection) { | ||
60 | tags = append(tags, s.Text()) | ||
61 | }) | ||
62 | c["__tags"] = tags | ||
63 | vas := make([]string, 0) | ||
64 | leftSide.ChildrenFiltered("div").Eq(1).Find("a").Each(func(i int, s *goquery.Selection) { | ||
65 | vas = append(vas, s.Text()) | ||
66 | }) | ||
67 | c["__vas"] = vas | ||
68 | leftSide.ChildrenFiltered("dl").Children().Each(func(i int, s *goquery.Selection) { | ||
69 | switch goquery.NodeName(s) { | ||
70 | case "dt": | ||
71 | key = s.Text() | ||
72 | case "dd": | ||
73 | c[key] = s.Text() | ||
74 | } | ||
75 | }) | ||
76 | apps := make([]int, 0) | ||
77 | rightSide.Find(".tile3top").Each(func(i int, s *goquery.Selection) { | ||
78 | val, _ = s.Find("a").Attr("href") | ||
79 | id, _ := strconv.Atoi(strings.Split(val, "=")[1]) | ||
80 | apps = append(apps, id) | ||
81 | }) | ||
82 | c["__appearances"] = apps | ||
83 | return c, nil | ||
84 | } | ||
diff --git a/tools/charcrawler/crawler/acd_anime.go b/tools/charcrawler/crawler/acd_anime.go new file mode 100644 index 0000000..c697b43 --- /dev/null +++ b/tools/charcrawler/crawler/acd_anime.go | |||
@@ -0,0 +1,44 @@ | |||
1 | package crawler | ||
2 | |||
3 | import ( | ||
4 | "log" | ||
5 | "strconv" | ||
6 | "strings" | ||
7 | |||
8 | "github.com/PuerkitoBio/goquery" | ||
9 | ) | ||
10 | |||
// ACDAnimeCrawler is a stateless crawler for series pages on
// animecharactersdatabase.com. The zero value is ready to use.
type ACDAnimeCrawler struct{}

// Name returns the identifier of this crawler, "ACDAnime".
func (a ACDAnimeCrawler) Name() string {
	const name = "ACDAnime"
	return name
}
16 | |||
17 | func (a ACDAnimeCrawler) Crawl(id int) (CharacterData, error) { | ||
18 | c := make(CharacterData) | ||
19 | doc, err := goquery.NewDocument("http://www.animecharactersdatabase.com/series.php?id=" + strconv.Itoa(id)) | ||
20 | if err != nil { | ||
21 | log.Println(err) | ||
22 | return nil, CrawlError | ||
23 | } | ||
24 | text := doc.Text() | ||
25 | if strings.Contains(text, "bad series : try") { | ||
26 | return nil, CharacterNotFound | ||
27 | } | ||
28 | if strings.Contains(text, "Your IP has been blocked") { | ||
29 | return nil, Banned | ||
30 | } | ||
31 | tds := doc.Find("#besttable > table > tbody > tr > td") | ||
32 | val, _ := tds.Eq(0).Find("img").Attr("src") | ||
33 | c["__thumb"] = val | ||
34 | tds.Eq(1).Find("tr").Each(func(i int, s *goquery.Selection) { | ||
35 | key := s.Find("th").Text() | ||
36 | value := s.Find("td").Text() | ||
37 | if key == "Home Page" { | ||
38 | val, _ = s.Find("td > a").Attr("href") | ||
39 | value = val | ||
40 | } | ||
41 | c[key] = value | ||
42 | }) | ||
43 | return c, nil | ||
44 | } | ||
diff --git a/tools/charcrawler/crawler/anidb.go b/tools/charcrawler/crawler/anidb.go new file mode 100644 index 0000000..12bf05b --- /dev/null +++ b/tools/charcrawler/crawler/anidb.go | |||
@@ -0,0 +1,69 @@ | |||
1 | package crawler | ||
2 | |||
3 | import ( | ||
4 | "log" | ||
5 | "strconv" | ||
6 | "strings" | ||
7 | |||
8 | "github.com/PuerkitoBio/goquery" | ||
9 | ) | ||
10 | |||
// AniDBCrawler is a stateless crawler for character pages on anidb.net.
// The zero value is ready to use.
type AniDBCrawler struct{}

// Name returns the identifier of this crawler, "AniDB".
func (a AniDBCrawler) Name() string {
	const name = "AniDB"
	return name
}
16 | |||
17 | func (a AniDBCrawler) Crawl(id int) (CharacterData, error) { | ||
18 | c := make(CharacterData) | ||
19 | doc, err := goquery.NewDocument("http://anidb.net/perl-bin/animedb.pl?show=character&charid=" + strconv.Itoa(id)) | ||
20 | if err != nil { | ||
21 | log.Println(err) | ||
22 | return nil, CrawlError | ||
23 | } | ||
24 | siteErr, _ := doc.Find(".error p").Html() | ||
25 | if strings.Contains(siteErr, "Unknown character id") { | ||
26 | return nil, CharacterNotFound | ||
27 | } else if strings.Contains(siteErr, "BANNED") { | ||
28 | return nil, CrawlError | ||
29 | } | ||
30 | first := true | ||
31 | links := make([]string, 0) | ||
32 | doc.Find(".characters").Each(func(i int, s *goquery.Selection) { | ||
33 | if first { | ||
34 | s.Find(".g_definitionlist tr").Each(func(i int, s *goquery.Selection) { | ||
35 | keyHtml, _ := s.Find("th").Html() | ||
36 | valueHtml, _ := s.Find("td").Html() | ||
37 | key := strings.TrimSpace(keyHtml) | ||
38 | value := strings.TrimSpace(valueHtml) | ||
39 | c[key] = value | ||
40 | }) | ||
41 | desc, _ := s.Find(".desc").Html() | ||
42 | c["__desc"] = strings.TrimSpace(desc) | ||
43 | imgUrl, _ := s.Find(".image img").Attr("src") | ||
44 | c["__img"] = imgUrl | ||
45 | animes := make([]map[string]string, 0) | ||
46 | s.Find(".animelist tr").Each(func(i int, s *goquery.Selection) { | ||
47 | if s.HasClass("header") { | ||
48 | return | ||
49 | } | ||
50 | anime := make(map[string]string) | ||
51 | relHtml, _ := s.Find(".relation").Html() | ||
52 | nameHtml, _ := s.Find(".name a").Html() | ||
53 | rel := strings.TrimSpace(relHtml) | ||
54 | name := strings.TrimSpace(nameHtml) | ||
55 | anime["rel"] = rel | ||
56 | anime["name"] = name | ||
57 | animes = append(animes, anime) | ||
58 | }) | ||
59 | c["__animes"] = animes | ||
60 | } else { | ||
61 | link, _ := s.Find(".mainname a").Attr("href") | ||
62 | links = append(links, strings.Replace(link, "http://anidb.net/ch", "", 1)) | ||
63 | } | ||
64 | first = false | ||
65 | }) | ||
66 | c["__links"] = links | ||
67 | |||
68 | return c, nil | ||
69 | } | ||
diff --git a/tools/charcrawler/crawler/crawler.go b/tools/charcrawler/crawler/crawler.go new file mode 100644 index 0000000..3d27ea2 --- /dev/null +++ b/tools/charcrawler/crawler/crawler.go | |||
@@ -0,0 +1,70 @@ | |||
1 | package crawler | ||
2 | |||
3 | import ( | ||
4 | "encoding/json" | ||
5 | "errors" | ||
6 | "io/ioutil" | ||
7 | "log" | ||
8 | "os" | ||
9 | "strconv" | ||
10 | "strings" | ||
11 | "time" | ||
12 | ) | ||
13 | |||
type (
	// CharacterData is the loosely typed result of one crawl. Start
	// serialises it to JSON unchanged.
	CharacterData map[string]interface{}

	// Crawler is a data source that can be queried by numeric id.
	Crawler interface {
		// Name returns the identifier of the crawler. Start uses it for
		// log prefixes and for the names of the data directory and the
		// progress file.
		Name() string
		// Crawl fetches the entry with the given id. Start gives the
		// package's sentinel errors special treatment.
		Crawl(id int) (CharacterData, error)
	}
)

// Instances holds the crawlers registered by this package's init function.
var Instances []Crawler

// Sentinel errors returned by Crawler implementations.
var (
	// CrawlError reports that a page could not be loaded.
	CrawlError = errors.New("Error while crawling")
	// CharacterNotFound reports that the source has no entry for the id.
	CharacterNotFound = errors.New("Character not found")
	// Banned reports that the source has blocked the crawler.
	Banned = errors.New("Crawler banned from source")
)
28 | |||
29 | func Start(c Crawler) { | ||
30 | name := c.Name() | ||
31 | os.MkdirAll("data/"+name, 0755) | ||
32 | log.Printf("Starting Crawler %s...", name) | ||
33 | ticker := time.NewTicker(time.Second * 1) | ||
34 | current := 1 | ||
35 | if save, err := ioutil.ReadFile(name + ".txt"); err == nil { | ||
36 | s := strings.TrimSpace(string(save)) | ||
37 | if i, err := strconv.Atoi(s); err == nil { | ||
38 | current = i | ||
39 | } | ||
40 | } | ||
41 | faultCounter := 0 | ||
42 | for range ticker.C { | ||
43 | if faultCounter > 100 { | ||
44 | faultCounter = 0 | ||
45 | log.Printf("[%s] Exiting after 100 fails", name) | ||
46 | break | ||
47 | } | ||
48 | log.Printf("[%s] Crawling %d", name, current) | ||
49 | char, err := c.Crawl(current) | ||
50 | switch err { | ||
51 | case CharacterNotFound: | ||
52 | log.Printf("[%s] Char %d not found!", name, current) | ||
53 | faultCounter++ | ||
54 | case CrawlError: | ||
55 | panic(err) | ||
56 | case Banned: | ||
57 | panic(err) | ||
58 | default: | ||
59 | cData, _ := json.Marshal(char) | ||
60 | ioutil.WriteFile("data/"+name+"/"+strconv.Itoa(current)+".json", cData, 0755) | ||
61 | } | ||
62 | |||
63 | current++ | ||
64 | ioutil.WriteFile(name+".txt", []byte(strconv.Itoa(current)), os.ModePerm) | ||
65 | } | ||
66 | } | ||
67 | |||
68 | func init() { | ||
69 | Instances = append(Instances, new(ACDCrawler), new(ACDAnimeCrawler)) | ||
70 | } | ||
diff --git a/tools/charcrawler/main.go b/tools/charcrawler/main.go new file mode 100644 index 0000000..a87c13c --- /dev/null +++ b/tools/charcrawler/main.go | |||
@@ -0,0 +1,16 @@ | |||
1 | package main | ||
2 | |||
3 | import ( | ||
4 | "time" | ||
5 | |||
6 | "fagott.pw/grilist/tools/charcrawler/crawler" | ||
7 | ) | ||
8 | |||
9 | func main() { | ||
10 | for _, c := range crawler.Instances { | ||
11 | go crawler.Start(c) | ||
12 | } | ||
13 | for { | ||
14 | time.Sleep(time.Minute) | ||
15 | } | ||
16 | } | ||