fix: restore live watch-order scraping
This commit is contained in:
@@ -16,7 +16,6 @@ import (
|
||||
"mal/internal/features/auth"
|
||||
"mal/internal/jikan"
|
||||
"mal/internal/server"
|
||||
"mal/internal/watchorder"
|
||||
"mal/internal/worker"
|
||||
)
|
||||
|
||||
@@ -40,22 +39,7 @@ func main() {
|
||||
|
||||
queries := database.New(db)
|
||||
authService := auth.NewService(queries)
|
||||
|
||||
watchOrderFile := os.Getenv("WATCH_ORDER_FILE")
|
||||
if watchOrderFile == "" {
|
||||
watchOrderFile = "./data/watch_order.json"
|
||||
}
|
||||
|
||||
watchOrderStore := watchorder.EmptyStore()
|
||||
loadedStore, err := watchorder.LoadFromFile(watchOrderFile)
|
||||
if err != nil {
|
||||
log.Printf("watch-order: failed to load %s: %v", watchOrderFile, err)
|
||||
} else {
|
||||
watchOrderStore = loadedStore
|
||||
log.Printf("watch-order: loaded %d entries from %s", watchOrderStore.Len(), watchOrderFile)
|
||||
}
|
||||
|
||||
jikanClient := jikan.NewClient(queries, watchOrderStore)
|
||||
jikanClient := jikan.NewClient(queries)
|
||||
|
||||
// Start background workers
|
||||
relationsWorker := worker.New(queries, jikanClient)
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"mal/internal/watchorder"
|
||||
)
|
||||
|
||||
// defaultUserAgent is the User-Agent header value that fetchDocument sends
// with each request.
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
||||
|
||||
// idPattern captures the run of digits that follows "/id/" in a URL.
var idPattern = regexp.MustCompile(`/id/(\d+)`)
|
||||
|
||||
// seedPayload is the JSON shape of the seed file: an object whose "ids"
// array lists the ids to process.
type seedPayload struct {
	// IDs holds the integers decoded from the "ids" key.
	IDs []int `json:"ids"`
}
|
||||
|
||||
type outputPayload struct {
|
||||
Data map[string][]watchorder.WatchOrderEntry `json:"data"`
|
||||
}
|
||||
|
||||
func parseRootID(url string) (int, error) {
|
||||
match := idPattern.FindStringSubmatch(url)
|
||||
if len(match) != 2 {
|
||||
return 0, fmt.Errorf("invalid watch-order url: %s", url)
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(match[1])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid watch-order id in url %s: %w", url, err)
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func fetchDocument(ctx context.Context, client *http.Client, url string) (*goquery.Document, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", defaultUserAgent)
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
req.Header.Set("Referer", "https://chiaki.site/")
|
||||
req.Header.Set("Cache-Control", "no-cache")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return goquery.NewDocumentFromReader(resp.Body)
|
||||
}
|
||||
|
||||
func parseRows(doc *goquery.Document) []watchorder.WatchOrderEntry {
|
||||
entries := make([]watchorder.WatchOrderEntry, 0)
|
||||
|
||||
doc.Find("tr[data-id]").Each(func(_ int, selection *goquery.Selection) {
|
||||
rawID, ok := selection.Attr("data-id")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(strings.TrimSpace(rawID))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
typeLabel := ""
|
||||
rawTypeID, hasType := selection.Attr("data-type")
|
||||
if hasType {
|
||||
typeID := strings.TrimSpace(rawTypeID)
|
||||
typeLabel = mapTypeByID(doc, typeID)
|
||||
}
|
||||
|
||||
title := strings.TrimSpace(selection.Find(".wo_title").First().Text())
|
||||
titleAlt := strings.TrimSpace(selection.Find(".uk-text-small").First().Text())
|
||||
|
||||
entries = append(entries, watchorder.WatchOrderEntry{
|
||||
ID: id,
|
||||
Type: typeLabel,
|
||||
Title: title,
|
||||
TitleAlt: titleAlt,
|
||||
})
|
||||
})
|
||||
|
||||
return entries
|
||||
}
|
||||
|
||||
func mapTypeByID(doc *goquery.Document, typeID string) string {
|
||||
label := ""
|
||||
doc.Find("#wo_type_filter label").EachWithBreak(func(_ int, selection *goquery.Selection) bool {
|
||||
input := selection.Find("input[type='checkbox']")
|
||||
value, ok := input.Attr("value")
|
||||
if ok && strings.TrimSpace(value) == typeID {
|
||||
label = strings.TrimSpace(selection.Text())
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
return label
|
||||
}
|
||||
|
||||
// parseIDList parses a comma-separated list of integers.
//
// Whitespace around each item is ignored and empty items are skipped. A
// blank input yields an empty, non-nil slice. The first item that is not a
// valid integer aborts parsing and is reported in the returned error.
func parseIDList(value string) ([]int, error) {
	if strings.TrimSpace(value) == "" {
		return []int{}, nil
	}

	fields := strings.Split(value, ",")
	parsed := make([]int, 0, len(fields))
	for i := range fields {
		token := strings.TrimSpace(fields[i])
		if token != "" {
			number, convErr := strconv.Atoi(token)
			if convErr != nil {
				return nil, fmt.Errorf("invalid id %q: %w", token, convErr)
			}
			parsed = append(parsed, number)
		}
	}

	return parsed, nil
}
|
||||
|
||||
func loadSeedIDs(path string) ([]int, error) {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return []int{}, nil
|
||||
}
|
||||
|
||||
content, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
payload := seedPayload{}
|
||||
if err := json.Unmarshal(content, &payload); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return payload.IDs, nil
|
||||
}
|
||||
|
||||
// sortAndUnique returns the strictly positive values of ids in ascending
// order with duplicates removed.
//
// The input slice is left untouched and the result is never nil.
func sortAndUnique(ids []int) []int {
	// Copy the positive values so that sorting does not disturb the caller's
	// slice.
	positives := make([]int, 0, len(ids))
	for _, value := range ids {
		if value > 0 {
			positives = append(positives, value)
		}
	}

	sort.Ints(positives)

	// After sorting, duplicates are adjacent; compact them in place.
	result := positives[:0]
	previous := 0 // never matches: every kept value is > 0
	for _, value := range positives {
		if value == previous {
			continue
		}
		result = append(result, value)
		previous = value
	}

	return result
}
|
||||
|
||||
func main() {
|
||||
outputPath := flag.String("out", "data/watch_order.json", "output json file path")
|
||||
seedPath := flag.String("seed", "tmp/watch_order_seed_ids.json", "seed json file path with {\"ids\": [...]} ")
|
||||
idList := flag.String("ids", "", "comma-separated MAL ids")
|
||||
flag.Parse()
|
||||
|
||||
idsFromFlag, err := parseIDList(*idList)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
idsFromSeed, err := loadSeedIDs(*seedPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to load seed ids: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
allIDs := sortAndUnique(append(idsFromSeed, idsFromFlag...))
|
||||
if len(allIDs) == 0 {
|
||||
fmt.Fprintln(os.Stderr, "error: no ids provided (use -seed and/or -ids)")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
httpClient := &http.Client{Timeout: 12 * time.Second}
|
||||
ctx := context.Background()
|
||||
|
||||
data := make(map[string][]watchorder.WatchOrderEntry, len(allIDs))
|
||||
for _, id := range allIDs {
|
||||
url := fmt.Sprintf("https://chiaki.site/?/tools/watch_order/id/%d", id)
|
||||
if _, err := parseRootID(url); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
doc, err := fetchDocument(ctx, httpClient, url)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if doc.Find("#wo_list").Length() == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
data[strconv.Itoa(id)] = parseRows(doc)
|
||||
}
|
||||
|
||||
encoded, err := json.Marshal(outputPayload{Data: data})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to encode output: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
outputDirectory := filepath.Dir(*outputPath)
|
||||
if err := os.MkdirAll(outputDirectory, 0o755); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to create data directory: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(*outputPath, encoded, 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to write output %q: %v\n", *outputPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("wrote watch-order dataset for %d ids to %s\n", len(data), *outputPath)
|
||||
}
|
||||
Reference in New Issue
Block a user