fix: restore live watch-order scraping

This commit is contained in:
2026-04-11 22:46:15 +02:00
parent 10321195aa
commit 30d75eeced
11 changed files with 669 additions and 430 deletions

View File

@@ -16,7 +16,6 @@ import (
"mal/internal/features/auth"
"mal/internal/jikan"
"mal/internal/server"
"mal/internal/watchorder"
"mal/internal/worker"
)
@@ -40,22 +39,7 @@ func main() {
queries := database.New(db)
authService := auth.NewService(queries)
watchOrderFile := os.Getenv("WATCH_ORDER_FILE")
if watchOrderFile == "" {
watchOrderFile = "./data/watch_order.json"
}
watchOrderStore := watchorder.EmptyStore()
loadedStore, err := watchorder.LoadFromFile(watchOrderFile)
if err != nil {
log.Printf("watch-order: failed to load %s: %v", watchOrderFile, err)
} else {
watchOrderStore = loadedStore
log.Printf("watch-order: loaded %d entries from %s", watchOrderStore.Len(), watchOrderFile)
}
jikanClient := jikan.NewClient(queries, watchOrderStore)
jikanClient := jikan.NewClient(queries)
// Start background workers
relationsWorker := worker.New(queries, jikanClient)

View File

@@ -1,243 +0,0 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"mal/internal/watchorder"
)
// defaultUserAgent is the browser-like User-Agent header value that
// fetchDocument sends with every request.
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
// idPattern matches an "/id/<digits>" segment and captures the digits; it is
// used by parseRootID to pull the numeric id out of a watch-order URL.
var idPattern = regexp.MustCompile(`/id/(\d+)`)
// seedPayload is the JSON shape decoded by loadSeedIDs: an object whose "ids"
// field holds a list of integer ids.
type seedPayload struct {
IDs []int `json:"ids"`
}
// outputPayload is the JSON document that main writes to the output file. The
// "data" object is keyed by the decimal string form of each requested id and
// maps to the watch-order entries parsed for that id.
type outputPayload struct {
Data map[string][]watchorder.WatchOrderEntry `json:"data"`
}
// parseRootID extracts the numeric id from a URL containing an "/id/<digits>"
// segment.
//
// It returns an error when the URL has no such segment, or when the captured
// digits cannot be converted to an int.
func parseRootID(url string) (int, error) {
	groups := idPattern.FindStringSubmatch(url)
	if len(groups) == 2 {
		rootID, convErr := strconv.Atoi(groups[1])
		if convErr == nil {
			return rootID, nil
		}
		return 0, fmt.Errorf("invalid watch-order id in url %s: %w", url, convErr)
	}
	return 0, fmt.Errorf("invalid watch-order url: %s", url)
}
// fetchDocument performs a GET request for url using client, sending a fixed
// set of browser-like headers, and parses the response body as an HTML
// document.
//
// Request construction and transport errors are returned unchanged. Any
// status other than 200 OK is reported as a "status <code>" error. The
// response body is closed when the function returns.
func fetchDocument(ctx context.Context, client *http.Client, url string) (*goquery.Document, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	// Header name/value pairs applied to every request.
	headers := [][2]string{
		{"User-Agent", defaultUserAgent},
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
		{"Referer", "https://chiaki.site/"},
		{"Cache-Control", "no-cache"},
	}
	for _, header := range headers {
		request.Header.Set(header[0], header[1])
	}

	response, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("status %d", code)
	}
	return goquery.NewDocumentFromReader(response.Body)
}
// parseRows converts every <tr data-id="..."> element in doc into a
// watchorder.WatchOrderEntry, in document order.
//
// Rows whose data-id attribute is not an integer are skipped. The entry type
// is the label that mapTypeByID resolves for the row's data-type attribute, or
// the empty string when the row has no data-type. Title and TitleAlt come from
// the first ".wo_title" and ".uk-text-small" descendants respectively, with
// surrounding whitespace removed.
//
// The returned slice is never nil, so it encodes as a JSON array even when no
// rows are found.
func parseRows(doc *goquery.Document) []watchorder.WatchOrderEntry {
	rows := doc.Find("tr[data-id]")
	entries := make([]watchorder.WatchOrderEntry, 0, rows.Length())

	// The document is not modified while parsing, so the label for a given
	// type id is resolved once and reused for every later row instead of
	// re-scanning the filter markup per row.
	labelByType := make(map[string]string)

	rows.Each(func(_ int, row *goquery.Selection) {
		rawID, ok := row.Attr("data-id")
		if !ok {
			return
		}
		id, err := strconv.Atoi(strings.TrimSpace(rawID))
		if err != nil {
			return
		}

		typeLabel := ""
		if rawTypeID, hasType := row.Attr("data-type"); hasType {
			typeID := strings.TrimSpace(rawTypeID)
			label, cached := labelByType[typeID]
			if !cached {
				label = mapTypeByID(doc, typeID)
				labelByType[typeID] = label
			}
			typeLabel = label
		}

		entries = append(entries, watchorder.WatchOrderEntry{
			ID:       id,
			Type:     typeLabel,
			Title:    strings.TrimSpace(row.Find(".wo_title").First().Text()),
			TitleAlt: strings.TrimSpace(row.Find(".uk-text-small").First().Text()),
		})
	})
	return entries
}
// mapTypeByID looks up the display label for typeID among the <label>
// elements under "#wo_type_filter".
//
// A label matches when the value attribute of its checkbox input, with
// surrounding whitespace removed, equals typeID. The trimmed text of the first
// matching label is returned; if none matches, the result is the empty string.
func mapTypeByID(doc *goquery.Document, typeID string) string {
	var found string
	doc.Find("#wo_type_filter label").EachWithBreak(func(_ int, option *goquery.Selection) bool {
		value, hasValue := option.Find("input[type='checkbox']").Attr("value")
		if !hasValue || strings.TrimSpace(value) != typeID {
			return true // not this one; keep scanning
		}
		found = strings.TrimSpace(option.Text())
		return false // first match wins; stop iterating
	})
	return found
}
// parseIDList parses a comma-separated list of integer ids.
//
// Whitespace around each item is ignored and empty items are skipped. A blank
// input yields an empty, non-nil slice. The first item that is not a valid
// integer aborts parsing and is reported in the returned error.
func parseIDList(value string) ([]int, error) {
	if len(strings.TrimSpace(value)) == 0 {
		return []int{}, nil
	}

	fields := strings.Split(value, ",")
	parsed := make([]int, 0, len(fields))
	for i := range fields {
		token := strings.TrimSpace(fields[i])
		if token != "" {
			number, convErr := strconv.Atoi(token)
			if convErr != nil {
				return nil, fmt.Errorf("invalid id %q: %w", token, convErr)
			}
			parsed = append(parsed, number)
		}
	}
	return parsed, nil
}
// loadSeedIDs reads the JSON seed file at path and returns the ids listed in
// its "ids" field.
//
// A blank path means "no seed file" and yields an empty, non-nil slice. Read
// and JSON decoding errors are returned unchanged.
func loadSeedIDs(path string) ([]int, error) {
	if len(strings.TrimSpace(path)) == 0 {
		return []int{}, nil
	}

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}

	var seed seedPayload
	if decodeErr := json.Unmarshal(raw, &seed); decodeErr != nil {
		return nil, decodeErr
	}
	return seed.IDs, nil
}
// sortAndUnique returns the strictly positive values of ids in ascending
// order with duplicates removed.
//
// The input slice is left untouched; the result is always a fresh, non-nil
// slice.
func sortAndUnique(ids []int) []int {
	// Keep only positive ids, working on a copy so the caller's slice is
	// not reordered.
	positives := make([]int, 0, len(ids))
	for _, candidate := range ids {
		if candidate > 0 {
			positives = append(positives, candidate)
		}
	}
	sort.Ints(positives)

	// After sorting, duplicates are adjacent; compact them in place.
	result := positives[:0]
	for _, candidate := range positives {
		if len(result) == 0 || result[len(result)-1] != candidate {
			result = append(result, candidate)
		}
	}
	return result
}
// main builds the watch-order dataset file.
//
// It collects ids from the -seed JSON file and the -ids flag, de-duplicates
// and sorts them, fetches the watch-order page for each id, parses its rows,
// and writes the result as JSON to the -out path (creating the parent
// directory if needed).
//
// Invalid flags, an unreadable seed file, an empty id set, and encoding or
// write failures terminate the process with exit status 1. A failure for an
// individual id is not fatal: the id is left out of the dataset and a warning
// naming it is printed to stderr.
func main() {
	outputPath := flag.String("out", "data/watch_order.json", "output json file path")
	seedPath := flag.String("seed", "tmp/watch_order_seed_ids.json", "seed json file path with {\"ids\": [...]} ")
	idList := flag.String("ids", "", "comma-separated MAL ids")
	flag.Parse()

	idsFromFlag, err := parseIDList(*idList)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(1)
	}

	idsFromSeed, err := loadSeedIDs(*seedPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: failed to load seed ids: %v\n", err)
		os.Exit(1)
	}

	allIDs := sortAndUnique(append(idsFromSeed, idsFromFlag...))
	if len(allIDs) == 0 {
		fmt.Fprintln(os.Stderr, "error: no ids provided (use -seed and/or -ids)")
		os.Exit(1)
	}

	httpClient := &http.Client{Timeout: 12 * time.Second}
	ctx := context.Background()

	data := make(map[string][]watchorder.WatchOrderEntry, len(allIDs))
	for _, id := range allIDs {
		url := fmt.Sprintf("https://chiaki.site/?/tools/watch_order/id/%d", id)

		// Per-id failures are best-effort: report which id was dropped and
		// why, then move on, so one bad page does not abort the whole run.
		if _, err := parseRootID(url); err != nil {
			fmt.Fprintf(os.Stderr, "warning: skipping id %d: %v\n", id, err)
			continue
		}

		doc, err := fetchDocument(ctx, httpClient, url)
		if err != nil {
			fmt.Fprintf(os.Stderr, "warning: skipping id %d: fetch failed: %v\n", id, err)
			continue
		}

		// A page without the list container carries no watch order.
		if doc.Find("#wo_list").Length() == 0 {
			fmt.Fprintf(os.Stderr, "warning: skipping id %d: no #wo_list element in response\n", id)
			continue
		}

		data[strconv.Itoa(id)] = parseRows(doc)
	}

	if skipped := len(allIDs) - len(data); skipped > 0 {
		fmt.Fprintf(os.Stderr, "warning: skipped %d of %d ids\n", skipped, len(allIDs))
	}

	encoded, err := json.Marshal(outputPayload{Data: data})
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: failed to encode output: %v\n", err)
		os.Exit(1)
	}

	outputDirectory := filepath.Dir(*outputPath)
	if err := os.MkdirAll(outputDirectory, 0o755); err != nil {
		fmt.Fprintf(os.Stderr, "error: failed to create data directory: %v\n", err)
		os.Exit(1)
	}

	if err := os.WriteFile(*outputPath, encoded, 0o644); err != nil {
		fmt.Fprintf(os.Stderr, "error: failed to write output %q: %v\n", *outputPath, err)
		os.Exit(1)
	}

	fmt.Printf("wrote watch-order dataset for %d ids to %s\n", len(data), *outputPath)
}