From c725d960356cc162e3192ccd7631f228684dc646 Mon Sep 17 00:00:00 2001 From: mkelvers Date: Thu, 4 Jun 2026 16:09:53 +0200 Subject: [PATCH] docs: add recommendation architecture document --- docs/recommendation-architecture.md | 179 ++++++++++++++++++ .../024_add_recommendation_foundation.sql | 62 ++++++ 2 files changed, 241 insertions(+) create mode 100644 docs/recommendation-architecture.md create mode 100644 internal/database/migrations/024_add_recommendation_foundation.sql diff --git a/docs/recommendation-architecture.md b/docs/recommendation-architecture.md new file mode 100644 index 0000000..7abc012 --- /dev/null +++ b/docs/recommendation-architecture.md @@ -0,0 +1,179 @@ +# Recommendation Architecture + +This document defines the long-term shape of the `For You` discovery system. +The goal is to keep the current implementation simple enough to operate inside +the existing Go application while preserving a clean path toward a larger +recommender system. + +## Current Serving Model + +The current `For You` implementation is a bounded hybrid ranker: + +- builds weighted seeds from user watch history +- uses Jikan recommendation edges as collaborative candidates +- excludes anime already present in the watchlist +- boosts candidates that match user taste signals +- reranks the final list to reduce genre pileups + +The online request path stays intentionally small: + +1. load recent watchlist state +2. derive strong seeds +3. fetch bounded candidate set +4. score candidates +5. rerank for diversity +6. return top results + +## Target System Shape + +The future recommender should keep four stable layers: + +1. event collection +2. feature aggregation +3. candidate generation +4. ranking and reranking + +That separation matters more than the specific model used at each stage. + +## Event Collection + +Recommendations should eventually be driven by behavior events, not only by +watchlist state. 
+ +Important events: + +- `impression` +- `click` +- `add_to_watchlist` +- `start_watch` +- `progress_update` +- `complete` +- `drop` +- `hide_recommendation` +- `search` + +Event capture should preserve: + +- `user_id` +- `anime_id` +- `event_type` +- `occurred_at` +- `source` +- contextual metadata as JSON + +## Feature Aggregation + +Online requests should not recompute the full user profile from raw events. +Instead, background jobs should maintain aggregated feature snapshots. + +Useful profile features: + +- genre affinity +- theme affinity +- studio affinity +- demographic affinity +- completion rate by genre +- abandonment rate by genre +- preference for airing vs finished anime +- preference for recent vs older anime +- short-term interest profile +- long-term stable taste profile + +These features should eventually live in a durable profile snapshot table so +the serving path remains cheap. + +## Candidate Generation + +Candidate generation should be modular. Each source should produce: + +- `anime_id` +- `source` +- `source_score` +- explanation metadata + +Primary candidate sources: + +- item-item recommendation edges +- related anime and sequel chains +- content-similar anime from genres, themes, studios, and demographics +- trending titles inside the user taste envelope +- seasonal titles aligned with recent behavior +- editorial or promoted rails when needed + +Candidate generation should stay bounded. Ranking the full catalog online is +not a viable long-term approach. + +## Ranking + +The current ranker is heuristic by design. That is the correct starting point. + +Near-term ranking inputs: + +- collaborative recommendation weight +- watch history status weight +- recency decay +- progress-based engagement +- genre overlap +- theme overlap +- studio overlap +- demographic overlap +- airing or freshness alignment +- popularity moderation + +The ranking API should remain stable even if the scoring model changes later. 
+That allows a future move to gradient-boosted trees or other learned rankers +without rewriting candidate generation or serving. + +## Reranking + +The final serving stage should apply product constraints that raw ranking will +not handle well on its own: + +- genre diversity +- franchise caps +- duplicate suppression +- hide or negative-feedback suppression +- maturity filtering +- freshness and exploration budget + +This is intentionally a separate concern from relevance scoring. + +## Data Tables + +The first recommendation-specific schema additions should support: + +- append-only event capture +- recommendation impression tracking +- cached user profile snapshots + +These tables are created in migration `024_add_recommendation_foundation.sql`. + +## Roadmap + +### V1 + +- bounded hybrid ranker in request path +- uses watchlist history and Jikan metadata +- no offline jobs required + +### V2 + +- capture user recommendation and watch behavior events +- persist user profile snapshots +- precompute candidate caches +- add explicit feedback controls such as hide or not interested + +### V3 + +- split retrieval from ranking +- precompute similarity graphs and user candidate pools +- run offline evaluation on impressions, clicks, starts, and completes +- introduce learned ranking only when enough behavior data exists + +## Operational Rules + +- keep request-time fanout bounded +- keep scoring explainable +- log recommendation impressions before introducing heavier models +- prefer replaceable modules over one large recommendation function +- treat data collection as the foundation for later ML, not an optional extra
diff --git a/internal/database/migrations/024_add_recommendation_foundation.sql b/internal/database/migrations/024_add_recommendation_foundation.sql
new file mode 100644
index 0000000..22ff5d8
--- /dev/null
+++ b/internal/database/migrations/024_add_recommendation_foundation.sql
@@ -0,0 +1,88 @@
+-- +goose Up
+-- recommendation_event stores user behavior events for the recommendation
+-- system described in docs/recommendation-architecture.md.
+-- occurred_at and created_at both default to CURRENT_TIMESTAMP when omitted.
+-- Deleting a user deletes their events. Deleting an anime keeps the event
+-- and sets anime_id to NULL, which is why anime_id is nullable.
+CREATE TABLE IF NOT EXISTS recommendation_event (
+    id TEXT PRIMARY KEY,
+    user_id TEXT NOT NULL,
+    anime_id INTEGER,
+    event_type TEXT NOT NULL,
+    source TEXT,
+    metadata_json TEXT,
+    occurred_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY(user_id) REFERENCES user(id) ON DELETE CASCADE,
+    FOREIGN KEY(anime_id) REFERENCES anime(id) ON DELETE SET NULL
+);
+
+-- Events of one user, newest first.
+CREATE INDEX IF NOT EXISTS idx_recommendation_event_user_occurred_at
+ON recommendation_event(user_id, occurred_at DESC);
+
+-- Events of one user filtered by event_type, newest first.
+CREATE INDEX IF NOT EXISTS idx_recommendation_event_user_event_type_occurred_at
+ON recommendation_event(user_id, event_type, occurred_at DESC);
+
+-- Events for one anime, newest first. Also indexes the anime_id child key
+-- used by the foreign key above.
+CREATE INDEX IF NOT EXISTS idx_recommendation_event_anime_occurred_at
+ON recommendation_event(anime_id, occurred_at DESC);
+
+-- recommendation_impression ties a user and an anime to the rail and position
+-- of an impression; request_id and metadata_json are optional.
+-- Rows are deleted together with their user or their anime.
+CREATE TABLE IF NOT EXISTS recommendation_impression (
+    id TEXT PRIMARY KEY,
+    user_id TEXT NOT NULL,
+    anime_id INTEGER NOT NULL,
+    rail TEXT NOT NULL,
+    position INTEGER NOT NULL,
+    request_id TEXT,
+    metadata_json TEXT,
+    occurred_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY(user_id) REFERENCES user(id) ON DELETE CASCADE,
+    FOREIGN KEY(anime_id) REFERENCES anime(id) ON DELETE CASCADE
+);
+
+-- Impressions of one user, newest first.
+CREATE INDEX IF NOT EXISTS idx_recommendation_impression_user_occurred_at
+ON recommendation_impression(user_id, occurred_at DESC);
+
+-- Look up all impressions recorded under one request_id.
+CREATE INDEX IF NOT EXISTS idx_recommendation_impression_request_id
+ON recommendation_impression(request_id);
+
+-- Index the anime_id child key. Without it, deleting an anime row must scan
+-- this whole table to apply ON DELETE CASCADE. Mirrors
+-- idx_recommendation_event_anime_occurred_at.
+CREATE INDEX IF NOT EXISTS idx_recommendation_impression_anime_occurred_at
+ON recommendation_impression(anime_id, occurred_at DESC);
+
+-- recommendation_profile_snapshot holds at most one profile_json document per
+-- user (user_id is the primary key) and is deleted together with the user.
+-- NOTE(review): updated_at only gets its insert-time default here; nothing in
+-- this migration refreshes it on UPDATE, so writers must set it themselves.
+CREATE TABLE IF NOT EXISTS recommendation_profile_snapshot (
+    user_id TEXT PRIMARY KEY,
+    profile_json TEXT NOT NULL,
+    source_window_start DATETIME,
+    source_window_end DATETIME,
+    computed_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY(user_id) REFERENCES user(id) ON DELETE CASCADE
+);
+
+-- +goose Down
+-- Objects are dropped in reverse order of creation.
+DROP TABLE IF EXISTS recommendation_profile_snapshot;
+DROP INDEX IF EXISTS idx_recommendation_impression_anime_occurred_at;
+DROP INDEX IF EXISTS idx_recommendation_impression_request_id;
+DROP INDEX IF EXISTS idx_recommendation_impression_user_occurred_at;
+DROP TABLE IF EXISTS recommendation_impression;
+DROP INDEX IF EXISTS idx_recommendation_event_anime_occurred_at;
+DROP INDEX IF EXISTS idx_recommendation_event_user_event_type_occurred_at;
+DROP INDEX IF EXISTS idx_recommendation_event_user_occurred_at;
+DROP TABLE IF EXISTS recommendation_event;