Add Bandcamp discovery via public API (no scraping) - browse new releases by genre tag

2026-03-31 09:58:28 -05:00
parent be30a47bbb
commit 152f217675
7 changed files with 295 additions and 301 deletions
--- a/backend/app/services/bandcamp.py
+++ b/backend/app/services/bandcamp.py
@@ -1,255 +1,73 @@
-import re
-from difflib import SequenceMatcher
+"""Bandcamp discovery using their public APIs (no scraping)."""

 import httpx

-
-AUTOCOMPLETE_URL = "https://bandcamp.com/api/fuzzysearch/2/autocomplete"
-SEARCH_URL = "https://bandcamp.com/search"
-
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0",
 }

-
-def _normalize(s: str) -> str:
-    """Normalize string for comparison."""
-    return re.sub(r'[^a-z0-9\s]', '', s.lower()).strip()
+DIG_DEEPER_URL = "https://bandcamp.com/api/hub/2/dig_deeper"


-def _similarity(a: str, b: str) -> float:
-    """Return similarity ratio between two strings."""
-    return SequenceMatcher(None, _normalize(a), _normalize(b)).ratio()
+async def discover_by_tag(
+    tags: list[str],
+    sort: str = "new",
+    page: int = 1,
+) -> list[dict]:
+    """Discover new music on Bandcamp by tag using their public API.

+    Args:
+        tags: List of genre/tag strings (e.g. ["indie-rock", "shoegaze"])
+        sort: "new", "rec", or "pop" (new releases, recommended, popular)
+        page: Page number for pagination

-async def search_bandcamp_verified(artist: str, title: str) -> dict | None:
-    """Search Bandcamp and only return a result if the artist actually matches.
-
-    Returns the best matching result or None if no good match found.
-    First tries artist+song, then falls back to artist-only search.
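+    Returns a list of release dicts with keys: title, artist, art_url, bandcamp_url, genre (the requested tags, comma-joined), item_type ("album" or "track"); returns an empty list on a non-200 response.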
    """
-    # Try track search first: "artist title"
-    results = await search_bandcamp(f"{artist} {title}", item_type="t")
-    for r in results:
-        artist_sim = _similarity(r.get("artist", ""), artist)
-        title_sim = _similarity(r.get("title", ""), title)
-        # Require artist to be a strong match (>0.75) AND title reasonable (>0.5)
-        if artist_sim >= 0.75 and title_sim >= 0.5:
-            return r
+    async with httpx.AsyncClient(timeout=15, headers=HEADERS) as client:
+        resp = await client.post(
+            DIG_DEEPER_URL,
+            json={
+                "filters": {
+                    "format": "all",
+                    "location": 0,
+                    "sort": sort,
+                    "tags": tags,
+                },
+                "page": page,
+            },
+        )

-    # Try artist/band search as fallback — return their artist page URL
-    results = await search_bandcamp(artist, item_type="b")
-    for r in results:
-        # For band results, title IS the band name
-        name = r.get("title", "") or r.get("artist", "")
-        if _similarity(name, artist) >= 0.7:
-            return r
-
-    return None
-
-
-async def search_bandcamp(query: str, item_type: str = "t") -> list[dict]:
-    """Search Bandcamp for tracks, albums, or artists.
-
-    item_type: 't' for tracks, 'a' for albums, 'b' for bands/artists.
-    """
-    # Try autocomplete API first
-    results = await _search_autocomplete(query, item_type)
-    if results:
-        return results
-
-    # Fall back to HTML scraping
-    return await _search_html(query, item_type)
-
-
-async def _search_autocomplete(query: str, item_type: str) -> list[dict]:
-    """Try the undocumented Bandcamp autocomplete API."""
-    try:
-        async with httpx.AsyncClient(timeout=10, headers=HEADERS) as client:
-            resp = await client.get(AUTOCOMPLETE_URL, params={"q": query})
-
-        if resp.status_code != 200:
-            return []
-
-        data = resp.json()
-        results = []
-
-        # The autocomplete API returns results grouped by type
-        auto_results = data.get("results", [])
-        for item in auto_results:
-            result_type = item.get("type", "")
-
-            # Map autocomplete types to our item_type filter
-            if item_type == "t" and result_type != "t":
-                continue
-            if item_type == "a" and result_type != "a":
-                continue
-            if item_type == "b" and result_type != "b":
-                continue
-
-            results.append({
-                "title": item.get("name", ""),
-                "artist": item.get("band_name", ""),
-                "art_url": item.get("img", item.get("art_id", None)),
-                "bandcamp_url": item.get("url", ""),
-                "item_type": result_type,
-            })
-
-        return results[:20]
-    except Exception:
+    if resp.status_code != 200:
        return []

+    data = resp.json()
+    results = []

-async def _search_html(query: str, item_type: str) -> list[dict]:
-    """Fall back to scraping Bandcamp search results HTML."""
-    params = {"q": query, "item_type": item_type}
-    try:
-        async with httpx.AsyncClient(timeout=15, headers=HEADERS, follow_redirects=True) as client:
-            resp = await client.get(SEARCH_URL, params=params)
+    for item in data.get("items", []):
+        art_id = item.get("art_id")
+        art_url = f"https://f4.bcbits.com/img/a{art_id}_16.jpg" if art_id else None

-        if resp.status_code != 200:
-            return []
+        tralbum_type = item.get("tralbum_type", "a")
+        type_path = "album" if tralbum_type == "a" else "track"
+        item_url = item.get("tralbum_url", "")

-        html = resp.text
-        results = []
-
-        # Split by search result items
-        items = re.split(r'<li\s+class="searchresult\s', html)
-        for item_html in items[1:]:  # skip first split (before first result)
-            # Extract title and URL from heading link
-            heading_match = re.search(
-                r'class="heading">\s*<a\s+href="([^"]+)"[^>]*>\s*([^<]+)',
-                item_html,
-            )
-            if not heading_match:
-                continue
-
-            url = heading_match.group(1).strip()
-            title = heading_match.group(2).strip()
-
-            # Extract artist/subhead info
-            subhead_match = re.search(
-                r'class="subhead">\s*([^<]+)', item_html
-            )
-            artist = ""
-            if subhead_match:
-                subhead = subhead_match.group(1).strip()
-                # Subhead format varies: "by Artist" or "from Album by Artist"
-                by_match = re.search(r'by\s+(.+)', subhead)
-                if by_match:
-                    artist = by_match.group(1).strip()
-                else:
-                    artist = subhead
-
-            # Extract album art URL
-            art_match = re.search(
-                r'class="art">\s*<img\s+src="([^"]+)"', item_html
-            )
-            art_url = art_match.group(1).strip() if art_match else None
-
-            results.append({
-                "title": title,
-                "artist": artist,
-                "art_url": art_url,
-                "bandcamp_url": url,
-                "item_type": item_type,
-            })
-
-            if len(results) >= 20:
-                break
-
-        return results
-    except Exception:
-        return []
-
-
-async def get_embed_data(bandcamp_url: str) -> dict | None:
-    """Get embed info for a Bandcamp URL.
-
-    Fetches the page HTML, extracts the track/album ID, and returns
-    the embed iframe URL along with metadata.
-    """
-    try:
-        async with httpx.AsyncClient(timeout=15, headers=HEADERS, follow_redirects=True) as client:
-            resp = await client.get(bandcamp_url)
-
-        if resp.status_code != 200:
-            return None
-
-        html = resp.text
-
-        # Determine if this is a track or album URL
-        is_track = "/track/" in bandcamp_url
-
-        # Try to extract the ID from meta tags or data attributes
-        # Look for: <meta property="og:video" content="...album=12345..." />
-        # or data-tralbum-id="12345"
-        item_id = None
-
-        tralbum_match = re.search(r'data-tralbum-id="(\d+)"', html)
-        if tralbum_match:
-            item_id = tralbum_match.group(1)
-
-        if not item_id:
-            # Try og:video meta tag which contains embed URL with ID
-            og_match = re.search(
-                r'<meta\s+property="og:video"\s+content="[^"]*(?:album|track)=(\d+)',
-                html,
-            )
-            if og_match:
-                item_id = og_match.group(1)
-
-        if not item_id:
-            # Try the embedded player link in the page
-            embed_match = re.search(
-                r'EmbeddedPlayer/(?:album|track)=(\d+)', html
-            )
-            if embed_match:
-                item_id = embed_match.group(1)
-
-        if not item_id:
-            return None
-
-        # Build embed URL
-        id_type = "track" if is_track else "album"
-        embed_url = (
-            f"https://bandcamp.com/EmbeddedPlayer/"
-            f"{id_type}={item_id}/size=large/"
-            f"bgcol=1C1917/linkcol=7C3AED/"
-            f"tracklist=false/transparent=true/"
-        )
-
-        # Extract title from og:title
-        title = ""
-        title_match = re.search(
-            r'<meta\s+property="og:title"\s+content="([^"]+)"', html
-        )
-        if title_match:
-            title = title_match.group(1).strip()
-
-        # Extract artist
-        artist = ""
-        artist_match = re.search(
-            r'<meta\s+property="og:site_name"\s+content="([^"]+)"', html
-        )
-        if artist_match:
-            artist = artist_match.group(1).strip()
-
-        # Extract art
-        art_url = None
-        art_match = re.search(
-            r'<meta\s+property="og:image"\s+content="([^"]+)"', html
-        )
-        if art_match:
-            art_url = art_match.group(1).strip()
-
-        return {
-            "embed_url": embed_url,
-            "title": title,
-            "artist": artist,
+        results.append({
+            "title": item.get("title", ""),
+            "artist": item.get("artist", ""),
            "art_url": art_url,
-            "item_id": item_id,
-            "item_type": id_type,
-        }
-    except Exception:
-        return None
+            "bandcamp_url": item_url,
+            "genre": ", ".join(tags),
+            "item_type": type_path,
+        })
+
+    return results
+
+
+async def get_trending_tags() -> list[str]:
+    """Return a fixed, hard-coded list of common Bandcamp genre tags for discovery."""
+    return [
+        "indie-rock", "electronic", "hip-hop-rap", "ambient", "punk",
+        "experimental", "folk", "jazz", "metal", "pop", "r-b-soul",
+        "shoegaze", "post-punk", "synthwave", "lo-fi", "dream-pop",
+        "indie-pop", "psychedelic", "garage-rock", "emo",
+    ]