fix: restore live watch-order scraping
This commit is contained in:
@@ -32,7 +32,6 @@ RUN mkdir -p /app/data
|
||||
COPY --from=builder /app/main_server .
|
||||
COPY --from=builder /app/static ./static
|
||||
COPY --from=builder /app/migrations ./migrations
|
||||
COPY --from=builder /app/data ./data
|
||||
|
||||
# Expose the application port
|
||||
EXPOSE 3000
|
||||
|
||||
@@ -16,7 +16,6 @@ import (
|
||||
"mal/internal/features/auth"
|
||||
"mal/internal/jikan"
|
||||
"mal/internal/server"
|
||||
"mal/internal/watchorder"
|
||||
"mal/internal/worker"
|
||||
)
|
||||
|
||||
@@ -40,22 +39,7 @@ func main() {
|
||||
|
||||
queries := database.New(db)
|
||||
authService := auth.NewService(queries)
|
||||
|
||||
watchOrderFile := os.Getenv("WATCH_ORDER_FILE")
|
||||
if watchOrderFile == "" {
|
||||
watchOrderFile = "./data/watch_order.json"
|
||||
}
|
||||
|
||||
watchOrderStore := watchorder.EmptyStore()
|
||||
loadedStore, err := watchorder.LoadFromFile(watchOrderFile)
|
||||
if err != nil {
|
||||
log.Printf("watch-order: failed to load %s: %v", watchOrderFile, err)
|
||||
} else {
|
||||
watchOrderStore = loadedStore
|
||||
log.Printf("watch-order: loaded %d entries from %s", watchOrderStore.Len(), watchOrderFile)
|
||||
}
|
||||
|
||||
jikanClient := jikan.NewClient(queries, watchOrderStore)
|
||||
jikanClient := jikan.NewClient(queries)
|
||||
|
||||
// Start background workers
|
||||
relationsWorker := worker.New(queries, jikanClient)
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
|
||||
"mal/internal/watchorder"
|
||||
)
|
||||
|
||||
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
||||
|
||||
var idPattern = regexp.MustCompile(`/id/(\d+)`)
|
||||
|
||||
// seedPayload mirrors the JSON seed document, which lists the ids to scrape
// under the "ids" key.
type seedPayload struct {
	IDs []int `json:"ids"`
}
|
||||
|
||||
type outputPayload struct {
|
||||
Data map[string][]watchorder.WatchOrderEntry `json:"data"`
|
||||
}
|
||||
|
||||
func parseRootID(url string) (int, error) {
|
||||
match := idPattern.FindStringSubmatch(url)
|
||||
if len(match) != 2 {
|
||||
return 0, fmt.Errorf("invalid watch-order url: %s", url)
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(match[1])
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid watch-order id in url %s: %w", url, err)
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func fetchDocument(ctx context.Context, client *http.Client, url string) (*goquery.Document, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", defaultUserAgent)
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
req.Header.Set("Referer", "https://chiaki.site/")
|
||||
req.Header.Set("Cache-Control", "no-cache")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return goquery.NewDocumentFromReader(resp.Body)
|
||||
}
|
||||
|
||||
func parseRows(doc *goquery.Document) []watchorder.WatchOrderEntry {
|
||||
entries := make([]watchorder.WatchOrderEntry, 0)
|
||||
|
||||
doc.Find("tr[data-id]").Each(func(_ int, selection *goquery.Selection) {
|
||||
rawID, ok := selection.Attr("data-id")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(strings.TrimSpace(rawID))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
typeLabel := ""
|
||||
rawTypeID, hasType := selection.Attr("data-type")
|
||||
if hasType {
|
||||
typeID := strings.TrimSpace(rawTypeID)
|
||||
typeLabel = mapTypeByID(doc, typeID)
|
||||
}
|
||||
|
||||
title := strings.TrimSpace(selection.Find(".wo_title").First().Text())
|
||||
titleAlt := strings.TrimSpace(selection.Find(".uk-text-small").First().Text())
|
||||
|
||||
entries = append(entries, watchorder.WatchOrderEntry{
|
||||
ID: id,
|
||||
Type: typeLabel,
|
||||
Title: title,
|
||||
TitleAlt: titleAlt,
|
||||
})
|
||||
})
|
||||
|
||||
return entries
|
||||
}
|
||||
|
||||
func mapTypeByID(doc *goquery.Document, typeID string) string {
|
||||
label := ""
|
||||
doc.Find("#wo_type_filter label").EachWithBreak(func(_ int, selection *goquery.Selection) bool {
|
||||
input := selection.Find("input[type='checkbox']")
|
||||
value, ok := input.Attr("value")
|
||||
if ok && strings.TrimSpace(value) == typeID {
|
||||
label = strings.TrimSpace(selection.Text())
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
return label
|
||||
}
|
||||
|
||||
// parseIDList parses a comma-separated list of integers. Surrounding
// whitespace and empty items are ignored; a blank input yields an empty,
// non-nil slice. The first item that is not an integer aborts parsing with an
// error.
func parseIDList(value string) ([]int, error) {
	if strings.TrimSpace(value) == "" {
		return []int{}, nil
	}

	tokens := strings.Split(value, ",")
	parsed := make([]int, 0, len(tokens))
	for i := range tokens {
		token := strings.TrimSpace(tokens[i])
		if token != "" {
			number, convErr := strconv.Atoi(token)
			if convErr != nil {
				return nil, fmt.Errorf("invalid id %q: %w", token, convErr)
			}
			parsed = append(parsed, number)
		}
	}

	return parsed, nil
}
|
||||
|
||||
func loadSeedIDs(path string) ([]int, error) {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return []int{}, nil
|
||||
}
|
||||
|
||||
content, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
payload := seedPayload{}
|
||||
if err := json.Unmarshal(content, &payload); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return payload.IDs, nil
|
||||
}
|
||||
|
||||
// sortAndUnique returns the strictly positive ids in ascending order with
// duplicates removed. The input slice is left untouched and the result is
// never nil.
func sortAndUnique(ids []int) []int {
	positives := make([]int, 0, len(ids))
	for _, candidate := range ids {
		if candidate > 0 {
			positives = append(positives, candidate)
		}
	}
	sort.Ints(positives)

	// After sorting, duplicates are adjacent: keep a value only when it
	// differs from the one kept last.
	deduped := make([]int, 0, len(ids))
	for _, candidate := range positives {
		if last := len(deduped) - 1; last < 0 || deduped[last] != candidate {
			deduped = append(deduped, candidate)
		}
	}

	return deduped
}
|
||||
|
||||
func main() {
|
||||
outputPath := flag.String("out", "data/watch_order.json", "output json file path")
|
||||
seedPath := flag.String("seed", "tmp/watch_order_seed_ids.json", "seed json file path with {\"ids\": [...]} ")
|
||||
idList := flag.String("ids", "", "comma-separated MAL ids")
|
||||
flag.Parse()
|
||||
|
||||
idsFromFlag, err := parseIDList(*idList)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
idsFromSeed, err := loadSeedIDs(*seedPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to load seed ids: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
allIDs := sortAndUnique(append(idsFromSeed, idsFromFlag...))
|
||||
if len(allIDs) == 0 {
|
||||
fmt.Fprintln(os.Stderr, "error: no ids provided (use -seed and/or -ids)")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
httpClient := &http.Client{Timeout: 12 * time.Second}
|
||||
ctx := context.Background()
|
||||
|
||||
data := make(map[string][]watchorder.WatchOrderEntry, len(allIDs))
|
||||
for _, id := range allIDs {
|
||||
url := fmt.Sprintf("https://chiaki.site/?/tools/watch_order/id/%d", id)
|
||||
if _, err := parseRootID(url); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
doc, err := fetchDocument(ctx, httpClient, url)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if doc.Find("#wo_list").Length() == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
data[strconv.Itoa(id)] = parseRows(doc)
|
||||
}
|
||||
|
||||
encoded, err := json.Marshal(outputPayload{Data: data})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to encode output: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
outputDirectory := filepath.Dir(*outputPath)
|
||||
if err := os.MkdirAll(outputDirectory, 0o755); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to create data directory: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(*outputPath, encoded, 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error: failed to write output %q: %v\n", *outputPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("wrote watch-order dataset for %d ids to %s\n", len(data), *outputPath)
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -9,28 +9,21 @@ import (
|
||||
"time"
|
||||
|
||||
"mal/internal/database"
|
||||
"mal/internal/watchorder"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
httpClient *http.Client
|
||||
baseURL string
|
||||
db database.Querier
|
||||
watchOrders *watchorder.Store
|
||||
mu sync.Mutex
|
||||
lastReqTime time.Time
|
||||
}
|
||||
|
||||
func NewClient(db database.Querier, watchOrders *watchorder.Store) *Client {
|
||||
if watchOrders == nil {
|
||||
watchOrders = watchorder.EmptyStore()
|
||||
}
|
||||
|
||||
func NewClient(db database.Querier) *Client {
|
||||
return &Client{
|
||||
httpClient: &http.Client{Timeout: 10 * time.Second},
|
||||
baseURL: "https://api.jikan.moe/v4",
|
||||
db: db,
|
||||
watchOrders: watchOrders,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,17 @@ package jikan
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"mal/internal/watchorder"
|
||||
)
|
||||
|
||||
const chiakiWatchOrderURL = "https://chiaki.site/?/tools/watch_order/id/%d"
|
||||
const watchOrderCacheTTL = time.Hour * 24
|
||||
const maxWatchOrderEntries = 120
|
||||
|
||||
func watchOrderTypeLabel(value string) string {
|
||||
@@ -25,6 +32,49 @@ func isAllowedWatchOrderType(value string) bool {
|
||||
return normalized == "tv" || normalized == "movie"
|
||||
}
|
||||
|
||||
// relationCacheKey builds the cache key under which the watch order for the
// given id is stored.
func relationCacheKey(id int) string {
	const keyFormat = "relations:watch-order:%d"
	return fmt.Sprintf(keyFormat, id)
}
|
||||
|
||||
func (c *Client) getWatchOrder(ctx context.Context, id int) (watchorder.WatchOrderResult, error) {
|
||||
cacheKey := relationCacheKey(id)
|
||||
|
||||
var cached watchorder.WatchOrderResult
|
||||
if c.getCache(ctx, cacheKey, &cached) {
|
||||
return cached, nil
|
||||
}
|
||||
|
||||
watchOrderURL := fmt.Sprintf(chiakiWatchOrderURL, id)
|
||||
requestCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
result, err := watchorder.FetchWatchOrder(requestCtx, c.httpClient, watchOrderURL)
|
||||
if err != nil {
|
||||
var statusError *watchorder.HTTPStatusError
|
||||
if errors.Is(err, watchorder.ErrWatchOrderMarkupNotFound) {
|
||||
log.Printf("relations: watch-order markup missing for %d (%s): %v", id, watchOrderURL, err)
|
||||
} else if errors.As(err, &statusError) {
|
||||
log.Printf(
|
||||
"relations: watch-order http error for %d (%s): status=%d server=%q cf_ray=%q location=%q content_type=%q body=%q",
|
||||
id,
|
||||
watchOrderURL,
|
||||
statusError.StatusCode,
|
||||
statusError.Server,
|
||||
statusError.CFRay,
|
||||
statusError.Location,
|
||||
statusError.ContentType,
|
||||
statusError.BodyPreview,
|
||||
)
|
||||
} else {
|
||||
log.Printf("relations: watch-order fetch failed for %d (%s): %v", id, watchOrderURL, err)
|
||||
}
|
||||
return watchorder.WatchOrderResult{}, err
|
||||
}
|
||||
|
||||
c.setCache(ctx, cacheKey, result, watchOrderCacheTTL)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *Client) currentOnlyRelation(ctx context.Context, id int) ([]RelationEntry, error) {
|
||||
currentAnime, err := c.GetAnimeByID(ctx, id)
|
||||
if err != nil {
|
||||
@@ -40,16 +90,16 @@ func (c *Client) currentOnlyRelation(ctx context.Context, id int) ([]RelationEnt
|
||||
}
|
||||
|
||||
func (c *Client) GetFullRelations(ctx context.Context, id int) ([]RelationEntry, error) {
|
||||
watchOrder, found := c.watchOrders.Get(id)
|
||||
if !found {
|
||||
log.Printf("relations: no local watch-order data for %d", id)
|
||||
result, err := c.getWatchOrder(ctx, id)
|
||||
if err != nil {
|
||||
log.Printf("relations: using current-only fallback for %d: %v", id, err)
|
||||
return c.currentOnlyRelation(ctx, id)
|
||||
}
|
||||
|
||||
seen := make(map[int]bool)
|
||||
relations := make([]RelationEntry, 0, len(watchOrder)+1)
|
||||
relations := make([]RelationEntry, 0, len(result.WatchOrder)+1)
|
||||
|
||||
for _, watchOrderEntry := range watchOrder {
|
||||
for _, watchOrderEntry := range result.WatchOrder {
|
||||
if len(relations) >= maxWatchOrderEntries {
|
||||
break
|
||||
}
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
package watchorder
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type Store struct {
|
||||
byID map[int][]WatchOrderEntry
|
||||
}
|
||||
|
||||
func EmptyStore() *Store {
|
||||
return &Store{byID: make(map[int][]WatchOrderEntry)}
|
||||
}
|
||||
|
||||
func (s *Store) Len() int {
|
||||
if s == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return len(s.byID)
|
||||
}
|
||||
|
||||
func (s *Store) Get(id int) ([]WatchOrderEntry, bool) {
|
||||
if s == nil {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
entries, ok := s.byID[id]
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
return entries, true
|
||||
}
|
||||
|
||||
func LoadFromFile(path string) (*Store, error) {
|
||||
content, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read watch-order file %q: %w", path, err)
|
||||
}
|
||||
|
||||
rawMessages := make(map[string]json.RawMessage)
|
||||
if err := json.Unmarshal(content, &rawMessages); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse watch-order file %q: %w", path, err)
|
||||
}
|
||||
|
||||
raw := make(map[string][]WatchOrderEntry)
|
||||
if wrappedData, ok := rawMessages["data"]; ok && len(rawMessages) == 1 {
|
||||
if err := json.Unmarshal(wrappedData, &raw); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse watch-order data in file %q: %w", path, err)
|
||||
}
|
||||
} else {
|
||||
if err := json.Unmarshal(content, &raw); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse watch-order file %q: %w", path, err)
|
||||
}
|
||||
}
|
||||
|
||||
byID := make(map[int][]WatchOrderEntry, len(raw))
|
||||
for key, entries := range raw {
|
||||
id, err := strconv.Atoi(key)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid anime id key %q in watch-order file %q: %w", key, path, err)
|
||||
}
|
||||
|
||||
byID[id] = entries
|
||||
}
|
||||
|
||||
return &Store{byID: byID}, nil
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
package watchorder
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadFromFile_Success(t *testing.T) {
|
||||
temporaryDirectory := t.TempDir()
|
||||
filePath := filepath.Join(temporaryDirectory, "watch_order.json")
|
||||
|
||||
content := `{
|
||||
"1": [{"id": 1, "type": "TV", "title": "One"}],
|
||||
"2": [{"id": 2, "type": "Movie", "title": "Two"}]
|
||||
}`
|
||||
|
||||
if err := os.WriteFile(filePath, []byte(content), 0o644); err != nil {
|
||||
t.Fatalf("failed to write file: %v", err)
|
||||
}
|
||||
|
||||
store, err := LoadFromFile(filePath)
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if store.Len() != 2 {
|
||||
t.Fatalf("expected 2 ids, got %d", store.Len())
|
||||
}
|
||||
|
||||
entries, ok := store.Get(1)
|
||||
if !ok {
|
||||
t.Fatalf("expected id 1 to exist")
|
||||
}
|
||||
|
||||
if len(entries) != 1 || entries[0].ID != 1 {
|
||||
t.Fatalf("unexpected entries for id 1: %+v", entries)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadFromFile_InvalidIDKey(t *testing.T) {
|
||||
temporaryDirectory := t.TempDir()
|
||||
filePath := filepath.Join(temporaryDirectory, "watch_order.json")
|
||||
|
||||
if err := os.WriteFile(filePath, []byte(`{"abc": []}`), 0o644); err != nil {
|
||||
t.Fatalf("failed to write file: %v", err)
|
||||
}
|
||||
|
||||
_, err := LoadFromFile(filePath)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error for invalid id key")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadFromFile_WrappedPayload(t *testing.T) {
|
||||
temporaryDirectory := t.TempDir()
|
||||
filePath := filepath.Join(temporaryDirectory, "watch_order.json")
|
||||
|
||||
content := `{"data":{"10":[{"id":10,"type":"TV","title":"Ten"}]}}`
|
||||
if err := os.WriteFile(filePath, []byte(content), 0o644); err != nil {
|
||||
t.Fatalf("failed to write file: %v", err)
|
||||
}
|
||||
|
||||
store, err := LoadFromFile(filePath)
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if store.Len() != 1 {
|
||||
t.Fatalf("expected 1 id, got %d", store.Len())
|
||||
}
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
package watchorder
|
||||
|
||||
// WatchOrderEntry is one item of a watch order as stored in JSON. TitleAlt is
// omitted from the encoded form when empty.
type WatchOrderEntry struct {
	ID       int    `json:"id"`
	Type     string `json:"type"`
	Title    string `json:"title"`
	TitleAlt string `json:"title_alt,omitempty"`
}
|
||||
397
internal/watchorder/watch_order.go
Normal file
397
internal/watchorder/watch_order.go
Normal file
@@ -0,0 +1,397 @@
|
||||
package watchorder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
||||
|
||||
var idPattern = regexp.MustCompile(`/id/(\d+)`)
|
||||
var malLinkPattern = regexp.MustCompile(`myanimelist\.net/anime/(\d+)`)
|
||||
|
||||
var ErrInvalidWatchOrderURL = errors.New("invalid watch order url")
|
||||
var ErrWatchOrderMarkupNotFound = errors.New("watch order markup not found")
|
||||
|
||||
// HTTPStatusError describes a response whose status was not 200. Besides the
// status code and request URL it carries selected response headers and a short
// preview of the body for diagnostics.
type HTTPStatusError struct {
	StatusCode  int
	URL         string
	Server      string
	CFRay       string
	Location    string
	ContentType string
	BodyPreview string
}

// Error renders every field of the error on a single line; the body preview is
// quoted.
func (e *HTTPStatusError) Error() string {
	const layout = "unexpected status code: %d (url=%s server=%s cf_ray=%s location=%s content_type=%s body=%q)"

	return fmt.Sprintf(layout, e.StatusCode, e.URL, e.Server, e.CFRay, e.Location, e.ContentType, e.BodyPreview)
}
|
||||
|
||||
// WatchOrderEntry is a single item of a watch order. TitleAlt is left out of
// the JSON encoding when it is empty.
type WatchOrderEntry struct {
	ID       int    `json:"id"`
	Type     string `json:"type"`
	Title    string `json:"title"`
	TitleAlt string `json:"title_alt,omitempty"`
}
|
||||
|
||||
type WatchOrderResult struct {
|
||||
ID int `json:"id"`
|
||||
WatchOrder []WatchOrderEntry `json:"watch_order"`
|
||||
}
|
||||
|
||||
// watchOrderRow is the raw form of one table row before its numeric type id
// has been resolved to a label.
type watchOrderRow struct {
	id               int
	typeID           int
	title            string
	alternativeTitle string
}
|
||||
|
||||
func parseRootID(url string) (int, error) {
|
||||
match := idPattern.FindStringSubmatch(url)
|
||||
if len(match) != 2 {
|
||||
return 0, ErrInvalidWatchOrderURL
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(match[1])
|
||||
if err != nil {
|
||||
return 0, ErrInvalidWatchOrderURL
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func addCommonHeaders(request *http.Request) {
|
||||
request.Header.Set("User-Agent", defaultUserAgent)
|
||||
request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
|
||||
request.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
request.Header.Set("Referer", "https://chiaki.site/")
|
||||
request.Header.Set("Cache-Control", "no-cache")
|
||||
}
|
||||
|
||||
func fetchDocument(ctx context.Context, httpClient *http.Client, url string) (*goquery.Document, error) {
|
||||
client := httpClient
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
|
||||
request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
addCommonHeaders(request)
|
||||
|
||||
response, err := client.Do(request)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("request failed: %w", err)
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
if response.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(response.Body, 512))
|
||||
return nil, &HTTPStatusError{
|
||||
StatusCode: response.StatusCode,
|
||||
URL: url,
|
||||
Server: strings.TrimSpace(response.Header.Get("Server")),
|
||||
CFRay: strings.TrimSpace(response.Header.Get("CF-Ray")),
|
||||
Location: strings.TrimSpace(response.Header.Get("Location")),
|
||||
ContentType: strings.TrimSpace(response.Header.Get("Content-Type")),
|
||||
BodyPreview: strings.Join(strings.Fields(strings.TrimSpace(string(body))), " "),
|
||||
}
|
||||
}
|
||||
|
||||
document, err := goquery.NewDocumentFromReader(response.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse html: %w", err)
|
||||
}
|
||||
|
||||
return document, nil
|
||||
}
|
||||
|
||||
func extractTypeLabelsByID(doc *goquery.Document) map[int]string {
|
||||
typeLabels := make(map[int]string)
|
||||
|
||||
doc.Find("#wo_type_filter label").Each(func(_ int, selection *goquery.Selection) {
|
||||
input := selection.Find("input[type='checkbox']")
|
||||
rawID, exists := input.Attr("value")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
typeID, err := strconv.Atoi(strings.TrimSpace(rawID))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
label := strings.TrimSpace(selection.Text())
|
||||
if label == "" {
|
||||
return
|
||||
}
|
||||
|
||||
typeLabels[typeID] = label
|
||||
})
|
||||
|
||||
return typeLabels
|
||||
}
|
||||
|
||||
func parseAttrInt(selection *goquery.Selection, attrName string) (int, bool) {
|
||||
rawValue, exists := selection.Attr(attrName)
|
||||
if !exists {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
value, err := strconv.Atoi(strings.TrimSpace(rawValue))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
return value, true
|
||||
}
|
||||
|
||||
func extractRows(doc *goquery.Document) []watchOrderRow {
|
||||
rows := make([]watchOrderRow, 0)
|
||||
|
||||
doc.Find("tr[data-id]").Each(func(_ int, selection *goquery.Selection) {
|
||||
id, ok := parseAttrInt(selection, "data-id")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
typeID, ok := parseAttrInt(selection, "data-type")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
title := strings.TrimSpace(selection.Find(".wo_title").First().Text())
|
||||
alternativeTitle := strings.TrimSpace(selection.Find(".uk-text-small").First().Text())
|
||||
|
||||
rows = append(rows, watchOrderRow{
|
||||
id: id,
|
||||
typeID: typeID,
|
||||
title: title,
|
||||
alternativeTitle: alternativeTitle,
|
||||
})
|
||||
})
|
||||
|
||||
return rows
|
||||
}
|
||||
|
||||
func hasWatchOrderTable(doc *goquery.Document) bool {
|
||||
return doc.Find("#wo_list").Length() > 0
|
||||
}
|
||||
|
||||
func shouldTryProxy(err error) bool {
|
||||
var statusError *HTTPStatusError
|
||||
if errors.As(err, &statusError) {
|
||||
return statusError.StatusCode == http.StatusForbidden || statusError.StatusCode == http.StatusTooManyRequests || statusError.StatusCode == http.StatusServiceUnavailable
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// toJinaProxyURL rewrites url into its r.jina.ai proxy form. A leading
// "https://" and then a leading "http://" are stripped from url, and the
// remainder is appended after a fixed "http://" target scheme.
func toJinaProxyURL(url string) string {
	const proxyPrefix = "https://r.jina.ai/http://"

	withoutScheme := strings.TrimPrefix(url, "https://")
	withoutScheme = strings.TrimPrefix(withoutScheme, "http://")

	return proxyPrefix + withoutScheme
}
|
||||
|
||||
func fetchProxyText(ctx context.Context, httpClient *http.Client, url string) (string, error) {
|
||||
client := httpClient
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
|
||||
request, err := http.NewRequestWithContext(ctx, http.MethodGet, toJinaProxyURL(url), nil)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to create proxy request: %w", err)
|
||||
}
|
||||
|
||||
addCommonHeaders(request)
|
||||
|
||||
response, err := client.Do(request)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("proxy request failed: %w", err)
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
if response.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("proxy status %d", response.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(response.Body, 2*1024*1024))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read proxy response: %w", err)
|
||||
}
|
||||
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func parseJinaEntries(text string) []WatchOrderEntry {
|
||||
lines := strings.Split(text, "\n")
|
||||
entries := make([]WatchOrderEntry, 0)
|
||||
seen := make(map[int]bool)
|
||||
|
||||
for index, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if !strings.Contains(trimmed, "myanimelist.net/anime/") || !strings.Contains(trimmed, "|") {
|
||||
continue
|
||||
}
|
||||
|
||||
idMatch := malLinkPattern.FindStringSubmatch(trimmed)
|
||||
if len(idMatch) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(idMatch[1])
|
||||
if err != nil || seen[id] {
|
||||
continue
|
||||
}
|
||||
|
||||
parts := strings.Split(trimmed, "|")
|
||||
if len(parts) < 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
typeName := strings.TrimSpace(parts[1])
|
||||
if typeName == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
title, titleAlt := titleFromContext(lines, index)
|
||||
entries = append(entries, WatchOrderEntry{
|
||||
ID: id,
|
||||
Type: typeName,
|
||||
Title: title,
|
||||
TitleAlt: titleAlt,
|
||||
})
|
||||
seen[id] = true
|
||||
}
|
||||
|
||||
return entries
|
||||
}
|
||||
|
||||
// isNoiseTitleLine reports whether value cannot be a title. After trimming and
// lower-casing, that means a blank line, a line starting with "title:",
// "url source:", "markdown content:", "http://" or "https://", or a line
// containing "/ watch order".
func isNoiseTitleLine(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	if normalized == "" || strings.Contains(normalized, "/ watch order") {
		return true
	}

	noisePrefixes := []string{
		"title:",
		"url source:",
		"markdown content:",
		"http://",
		"https://",
	}
	for _, prefix := range noisePrefixes {
		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}

	return false
}
|
||||
|
||||
func titleFromContext(lines []string, metaIndex int) (string, string) {
|
||||
collected := make([]string, 0, 2)
|
||||
|
||||
for idx := metaIndex - 1; idx >= 0 && len(collected) < 2; idx-- {
|
||||
candidate := strings.TrimSpace(lines[idx])
|
||||
if candidate == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if isNoiseTitleLine(candidate) {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.Contains(candidate, "myanimelist.net/anime/") {
|
||||
continue
|
||||
}
|
||||
|
||||
collected = append(collected, candidate)
|
||||
}
|
||||
|
||||
if len(collected) == 0 {
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if len(collected) == 1 {
|
||||
return collected[0], ""
|
||||
}
|
||||
|
||||
return collected[1], collected[0]
|
||||
}
|
||||
|
||||
func fetchViaProxy(ctx context.Context, httpClient *http.Client, url string, rootID int) (WatchOrderResult, error) {
|
||||
proxyText, err := fetchProxyText(ctx, httpClient, url)
|
||||
if err != nil {
|
||||
return WatchOrderResult{}, err
|
||||
}
|
||||
|
||||
entries := parseJinaEntries(proxyText)
|
||||
if len(entries) == 0 {
|
||||
return WatchOrderResult{}, ErrWatchOrderMarkupNotFound
|
||||
}
|
||||
|
||||
return WatchOrderResult{ID: rootID, WatchOrder: entries}, nil
|
||||
}
|
||||
|
||||
func FetchWatchOrder(ctx context.Context, httpClient *http.Client, url string) (WatchOrderResult, error) {
|
||||
rootID, err := parseRootID(url)
|
||||
if err != nil {
|
||||
return WatchOrderResult{}, err
|
||||
}
|
||||
|
||||
doc, err := fetchDocument(ctx, httpClient, url)
|
||||
if err != nil {
|
||||
if shouldTryProxy(err) {
|
||||
return fetchViaProxy(ctx, httpClient, url, rootID)
|
||||
}
|
||||
return WatchOrderResult{}, err
|
||||
}
|
||||
|
||||
if !hasWatchOrderTable(doc) {
|
||||
return fetchViaProxy(ctx, httpClient, url, rootID)
|
||||
}
|
||||
|
||||
rows := extractRows(doc)
|
||||
if len(rows) == 0 {
|
||||
return WatchOrderResult{ID: rootID, WatchOrder: []WatchOrderEntry{}}, nil
|
||||
}
|
||||
|
||||
typeByID := extractTypeLabelsByID(doc)
|
||||
|
||||
entries := make([]WatchOrderEntry, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
typeName := strings.TrimSpace(typeByID[row.typeID])
|
||||
|
||||
entries = append(entries, WatchOrderEntry{
|
||||
ID: row.id,
|
||||
Type: typeName,
|
||||
Title: row.title,
|
||||
TitleAlt: row.alternativeTitle,
|
||||
})
|
||||
}
|
||||
|
||||
return WatchOrderResult{ID: rootID, WatchOrder: entries}, nil
|
||||
}
|
||||
212
internal/watchorder/watch_order_test.go
Normal file
212
internal/watchorder/watch_order_test.go
Normal file
@@ -0,0 +1,212 @@
|
||||
package watchorder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// testServer starts an HTTP test server that answers every request with body
// served as UTF-8 HTML. The caller is responsible for closing it.
func testServer(body string) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(writer http.ResponseWriter, _ *http.Request) {
		writer.Header().Set("Content-Type", "text/html; charset=utf-8")
		_, _ = writer.Write([]byte(body))
	}))
}
|
||||
|
||||
// testHTMLWithMetadata returns a page with a type filter (1 = TV, 3 = Movie)
// and a watch-order table holding a single row of type 3.
func testHTMLWithMetadata() string {
	const page = `
<!doctype html>
<html>
<body>
<div id="wo_type_filter">
<label><input type="checkbox" value="1" checked> TV</label>
<label><input type="checkbox" value="3" checked> Movie</label>
</div>
<table id="wo_list">
<tr data-id="442" data-anilist-id="442" data-type="3">
<td>
<span class="wo_title">Naruto Movie 1</span>
<span class="uk-text-small">Naruto the Movie 1</span>
</td>
</tr>
</table>
</body>
</html>`

	return page
}
|
||||
|
||||
// testHTMLEmptyRows returns a page with the type filter and a watch-order
// table that contains no rows.
func testHTMLEmptyRows() string {
	const page = `
<!doctype html>
<html>
<body>
<div id="wo_type_filter">
<label><input type="checkbox" value="1" checked> TV</label>
<label><input type="checkbox" value="3" checked> Movie</label>
</div>
<table id="wo_list"></table>
</body>
</html>`

	return page
}
|
||||
|
||||
func TestFetchWatchOrder_OutputShape(t *testing.T) {
|
||||
server := testServer(testHTMLWithMetadata())
|
||||
defer server.Close()
|
||||
|
||||
url := server.URL + "/?/tools/watch_order/id/442"
|
||||
result, err := FetchWatchOrder(context.Background(), &http.Client{Timeout: time.Second}, url)
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if result.ID != 442 {
|
||||
t.Fatalf("expected root id 442, got %d", result.ID)
|
||||
}
|
||||
|
||||
if len(result.WatchOrder) != 1 {
|
||||
t.Fatalf("expected 1 watch_order entry, got %d", len(result.WatchOrder))
|
||||
}
|
||||
|
||||
entry := result.WatchOrder[0]
|
||||
if entry.ID != 442 {
|
||||
t.Fatalf("expected entry id 442, got %d", entry.ID)
|
||||
}
|
||||
if entry.Type != "Movie" {
|
||||
t.Fatalf("expected type Movie, got %q", entry.Type)
|
||||
}
|
||||
if entry.Title != "Naruto Movie 1" {
|
||||
t.Fatalf("expected title Naruto Movie 1, got %q", entry.Title)
|
||||
}
|
||||
if entry.TitleAlt != "Naruto the Movie 1" {
|
||||
t.Fatalf("expected title_alt Naruto the Movie 1, got %q", entry.TitleAlt)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchWatchOrder_NoRowsReturnsEmpty(t *testing.T) {
|
||||
server := testServer(testHTMLEmptyRows())
|
||||
defer server.Close()
|
||||
|
||||
url := server.URL + "/?/tools/watch_order/id/1535"
|
||||
result, err := FetchWatchOrder(context.Background(), &http.Client{Timeout: time.Second}, url)
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if result.ID != 1535 {
|
||||
t.Fatalf("expected root id 1535, got %d", result.ID)
|
||||
}
|
||||
|
||||
if len(result.WatchOrder) != 0 {
|
||||
t.Fatalf("expected no entries, got %d", len(result.WatchOrder))
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchWatchOrder_MissingMarkupFallsBackToProxy(t *testing.T) {
|
||||
proxyPayload := `Title: Jujutsu Kaisen / Watch Order
|
||||
URL Source: https://chiaki.site/?/tools/watch_order/id/40748
|
||||
|
||||
Markdown Content:
|
||||
Jujutsu Kaisen
|
||||
|
||||
Oct 3, 2020 – Mar 27, 2021 | TV | 24ep × 23min. | ★8.51 | [](https://myanimelist.net/anime/40748)
|
||||
Jujutsu Kaisen 0 Movie
|
||||
|
||||
Jujutsu Kaisen 0
|
||||
|
||||
Dec 24, 2021 | Movie | 1ep × 1hr. 44min. | ★8.36 | [](https://myanimelist.net/anime/48561)
|
||||
`
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if strings.HasPrefix(r.URL.Path, "/http/") {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(proxyPayload))
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusForbidden)
|
||||
_, _ = w.Write([]byte("blocked"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
transport := http.DefaultTransport
|
||||
testClient := &http.Client{
|
||||
Timeout: time.Second,
|
||||
Transport: roundTripFunc(func(request *http.Request) (*http.Response, error) {
|
||||
if strings.HasPrefix(request.URL.Host, "r.jina.ai") {
|
||||
proxyURL := server.URL + "/http/" + strings.TrimPrefix(request.URL.Path, "/")
|
||||
proxyRequest, err := http.NewRequestWithContext(request.Context(), request.Method, proxyURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return transport.RoundTrip(proxyRequest)
|
||||
}
|
||||
|
||||
blockedURL := server.URL + request.URL.Path
|
||||
blockedRequest, err := http.NewRequestWithContext(request.Context(), request.Method, blockedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return transport.RoundTrip(blockedRequest)
|
||||
}),
|
||||
}
|
||||
|
||||
result, err := FetchWatchOrder(context.Background(), testClient, "https://chiaki.site/?/tools/watch_order/id/40748")
|
||||
if err != nil {
|
||||
t.Fatalf("expected no error, got %v", err)
|
||||
}
|
||||
|
||||
if len(result.WatchOrder) != 2 {
|
||||
t.Fatalf("expected 2 proxy entries, got %d", len(result.WatchOrder))
|
||||
}
|
||||
|
||||
if result.WatchOrder[0].ID != 40748 || result.WatchOrder[0].Type != "TV" {
|
||||
t.Fatalf("unexpected first entry: %+v", result.WatchOrder[0])
|
||||
}
|
||||
|
||||
if result.WatchOrder[1].ID != 48561 || result.WatchOrder[1].Type != "Movie" {
|
||||
t.Fatalf("unexpected second entry: %+v", result.WatchOrder[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchWatchOrder_HTTPStatusErrorIncludesContext(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Server", "cloudflare")
|
||||
w.Header().Set("CF-Ray", "abc123")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.WriteHeader(http.StatusForbidden)
|
||||
_, _ = w.Write([]byte("<html><body>access denied</body></html>"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
url := server.URL + "/?/tools/watch_order/id/1"
|
||||
_, err := fetchDocument(context.Background(), &http.Client{Timeout: time.Second}, url)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error, got nil")
|
||||
}
|
||||
|
||||
var statusError *HTTPStatusError
|
||||
if !errors.As(err, &statusError) {
|
||||
t.Fatalf("expected HTTPStatusError, got %T", err)
|
||||
}
|
||||
|
||||
if statusError.StatusCode != http.StatusForbidden {
|
||||
t.Fatalf("expected 403, got %d", statusError.StatusCode)
|
||||
}
|
||||
if statusError.CFRay != "abc123" {
|
||||
t.Fatalf("expected cf-ray abc123, got %q", statusError.CFRay)
|
||||
}
|
||||
if !strings.Contains(statusError.BodyPreview, "access denied") {
|
||||
t.Fatalf("expected body preview to include access denied, got %q", statusError.BodyPreview)
|
||||
}
|
||||
}
|
||||
|
||||
// roundTripFunc adapts a plain function to the http.RoundTripper interface
// so a test can supply transport behaviour inline.
type roundTripFunc func(*http.Request) (*http.Response, error)

// Compile-time check that roundTripFunc satisfies http.RoundTripper.
var _ http.RoundTripper = roundTripFunc(nil)

// RoundTrip calls the underlying function with req and returns its result
// unchanged.
func (fn roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	return fn(req)
}
|
||||
Reference in New Issue
Block a user