# vynl/backend/app/services/bandcamp.py

import logging
import re
import unicodedata
from difflib import SequenceMatcher
from html import unescape

import httpx


# Bandcamp autocomplete endpoint; search_bandcamp() tries this first.
AUTOCOMPLETE_URL = "https://bandcamp.com/api/fuzzysearch/2/autocomplete"
# Bandcamp HTML search page; scraped when autocomplete returns nothing.
SEARCH_URL = "https://bandcamp.com/search"

# Headers attached to every httpx client in this module; the User-Agent is a
# desktop Firefox string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0",
}


def _normalize(s: str) -> str:
    """Normalize string for comparison."""
    return re.sub(r'[^a-z0-9\s]', '', s.lower()).strip()


def _similarity(a: str, b: str) -> float:
    """Return the similarity ratio between two strings after normalization.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in [0.0, 1.0] from ``SequenceMatcher.ratio()``. Returns 0.0
        when either string is empty after normalization.
    """
    left = _normalize(a)
    right = _normalize(b)
    # SequenceMatcher reports 1.0 for two empty sequences, which would make a
    # missing field look like a perfect match for any name that normalizes to
    # nothing. Treat "nothing to compare" as no match instead.
    if not left or not right:
        return 0.0
    return SequenceMatcher(None, left, right).ratio()


async def search_bandcamp_verified(artist: str, title: str) -> dict | None:
    """Search Bandcamp and only return a result if the artist actually matches.

    First runs a track search for "artist title"; if no track qualifies,
    falls back to a band search on the artist name alone, which yields the
    artist's page entry.

    Args:
        artist: Artist name to verify results against.
        title: Track title to verify track results against.

    Returns:
        The best-scoring qualifying result dict, or None if nothing qualifies.
        A track qualifies when artist similarity >= 0.75 and title similarity
        >= 0.5; a band qualifies when name similarity >= 0.7. On equal scores
        the earlier search result wins.
    """
    # Try track search first: "artist title"
    results = await search_bandcamp(f"{artist} {title}", item_type="t")
    best = None
    best_score = 0.0
    for r in results:
        # `or ""` guards against fields that are present but None.
        artist_sim = _similarity(r.get("artist") or "", artist)
        title_sim = _similarity(r.get("title") or "", title)
        if artist_sim >= 0.75 and title_sim >= 0.5:
            score = artist_sim + title_sim
            # Strict ">" keeps the earliest result when scores tie.
            if score > best_score:
                best, best_score = r, score
    if best is not None:
        return best

    # Try artist/band search as fallback — return their artist page URL
    results = await search_bandcamp(artist, item_type="b")
    for r in results:
        # For band results, title IS the band name
        name = r.get("title") or r.get("artist") or ""
        score = _similarity(name, artist)
        if score >= 0.7 and score > best_score:
            best, best_score = r, score

    return best


async def search_bandcamp(query: str, item_type: str = "t") -> list[dict]:
    """Look up tracks, albums, or artists on Bandcamp.

    item_type selects the kind of result: 't' (tracks), 'a' (albums) or
    'b' (bands/artists). The autocomplete API is consulted first; the HTML
    search page is scraped only when autocomplete comes back empty.
    """
    autocomplete_hits = await _search_autocomplete(query, item_type)
    return autocomplete_hits if autocomplete_hits else await _search_html(query, item_type)


async def _search_autocomplete(query: str, item_type: str) -> list[dict]:
    """Try the undocumented Bandcamp autocomplete API.

    Args:
        query: Free-text search query.
        item_type: 't', 'a' or 'b' to keep only results of that type; any
            other value disables type filtering.

    Returns:
        Up to 20 dicts with keys ``title``, ``artist``, ``art_url``,
        ``bandcamp_url`` and ``item_type``. Returns an empty list on a
        request error, a non-200 response, or an unexpected payload shape.
    """
    try:
        async with httpx.AsyncClient(timeout=10, headers=HEADERS) as client:
            resp = await client.get(AUTOCOMPLETE_URL, params={"q": query})

        if resp.status_code != 200:
            return []

        data = resp.json()
    except Exception:
        # Best-effort lookup: the caller falls back to HTML scraping, so log
        # the failure rather than raise.
        logging.getLogger(__name__).warning(
            "Bandcamp autocomplete request failed for %r", query, exc_info=True
        )
        return []

    # NOTE(review): assumes the result list sits under a top-level "results"
    # key — confirm against a live response.
    raw_items = data.get("results") if isinstance(data, dict) else None
    if not isinstance(raw_items, list):
        return []

    filter_by_type = item_type in ("t", "a", "b")
    results = []
    for item in raw_items:
        # Skip malformed entries instead of discarding the whole response.
        if not isinstance(item, dict):
            continue

        result_type = item.get("type") or ""
        if filter_by_type and result_type != item_type:
            continue

        results.append({
            # `or ""` turns JSON nulls into empty strings so callers can
            # treat these fields as text.
            "title": item.get("name") or "",
            "artist": item.get("band_name") or "",
            # NOTE(review): "art_id" is passed through as-is when "img" is
            # missing — confirm consumers accept a non-URL value here.
            "art_url": item.get("img") or item.get("art_id"),
            "bandcamp_url": item.get("url") or "",
            "item_type": result_type,
        })

        if len(results) >= 20:
            break

    return results


async def _search_html(query: str, item_type: str) -> list[dict]:
    """Fall back to scraping Bandcamp search results HTML.

    Args:
        query: Free-text search query.
        item_type: Result type passed to the search page as ``item_type``
            and echoed into every returned dict.

    Returns:
        Up to 20 dicts with keys ``title``, ``artist``, ``art_url``,
        ``bandcamp_url`` and ``item_type``, with HTML entities decoded.
        Returns an empty list on a request error or a non-200 response.
    """
    params = {"q": query, "item_type": item_type}
    try:
        async with httpx.AsyncClient(timeout=15, headers=HEADERS, follow_redirects=True) as client:
            resp = await client.get(SEARCH_URL, params=params)

        if resp.status_code != 200:
            return []

        page = resp.text
    except Exception:
        # Best-effort scrape: log the failure and report "no results".
        logging.getLogger(__name__).warning(
            "Bandcamp HTML search failed for %r", query, exc_info=True
        )
        return []

    results = []

    # Split by search result items; the first chunk precedes the first result.
    for item_html in re.split(r'<li\s+class="searchresult\s', page)[1:]:
        # Extract title and URL from heading link
        heading_match = re.search(
            r'class="heading">\s*<a\s+href="([^"]+)"[^>]*>\s*([^<]+)',
            item_html,
        )
        if not heading_match:
            continue

        # Attribute values and text are entity-encoded in the markup
        # (e.g. "&amp;"); decode them so callers get plain text and URLs.
        url = unescape(heading_match.group(1).strip())
        title = unescape(heading_match.group(2).strip())

        # Extract artist/subhead info
        subhead_match = re.search(
            r'class="subhead">\s*([^<]+)', item_html
        )
        artist = ""
        if subhead_match:
            subhead = subhead_match.group(1).strip()
            # Subhead format varies: "by Artist" or "from Album by Artist".
            # \b requires "by" to be a whole word, so names such as "Baby"
            # are not mistaken for the separator.
            by_match = re.search(r'\bby\s+(.+)', subhead)
            if by_match:
                artist = by_match.group(1).strip()
            else:
                artist = subhead
            artist = unescape(artist)

        # Extract album art URL
        art_match = re.search(
            r'class="art">\s*<img\s+src="([^"]+)"', item_html
        )
        art_url = unescape(art_match.group(1).strip()) if art_match else None

        results.append({
            "title": title,
            "artist": artist,
            "art_url": art_url,
            "bandcamp_url": url,
            "item_type": item_type,
        })

        if len(results) >= 20:
            break

    return results


async def get_embed_data(bandcamp_url: str) -> dict | None:
    """Get embed info for a Bandcamp URL.

    Fetches the page HTML, extracts the track/album ID, and returns the
    embed iframe URL along with metadata read from the page's Open Graph
    meta tags.

    Args:
        bandcamp_url: URL of a Bandcamp track or album page. A URL containing
            "/track/" is treated as a track, anything else as an album.

    Returns:
        A dict with keys ``embed_url``, ``title``, ``artist``, ``art_url``,
        ``item_id`` and ``item_type`` ("track" or "album"). Returns None when
        the request fails, the response is not HTTP 200, or no ID is found.
    """
    # NOTE(review): bandcamp_url is fetched as given, following redirects. If
    # it can originate from an untrusted client, confirm it is validated
    # before reaching this function.
    try:
        async with httpx.AsyncClient(timeout=15, headers=HEADERS, follow_redirects=True) as client:
            resp = await client.get(bandcamp_url)

        if resp.status_code != 200:
            return None

        page = resp.text
    except Exception:
        # Best-effort lookup: log the failure and report "no embed".
        logging.getLogger(__name__).warning(
            "Bandcamp embed lookup failed for %r", bandcamp_url, exc_info=True
        )
        return None

    # Kind suggested by the URL; may be overridden below if the page only
    # exposes an ID of the other kind.
    id_type = "track" if "/track/" in bandcamp_url else "album"
    item_id = None

    tralbum_match = re.search(r'data-tralbum-id="(\d+)"', page)
    if tralbum_match:
        item_id = tralbum_match.group(1)
    else:
        # Collect kind -> ID pairs so the embed kind always agrees with the
        # ID that was actually found.
        ids = {}

        # og:video meta tag contains the embed URL with the ID(s).
        og_match = re.search(
            r'<meta\s+property="og:video"\s+content="([^"]*)', page
        )
        if og_match:
            ids = dict(re.findall(r'(album|track)=(\d+)', og_match.group(1)))

        if not ids:
            # Try the embedded player link in the page
            embed_match = re.search(r'EmbeddedPlayer/(album|track)=(\d+)', page)
            if embed_match:
                ids = {embed_match.group(1): embed_match.group(2)}

        if ids:
            # Prefer the kind implied by the URL; otherwise use what exists.
            if id_type not in ids:
                id_type = next(iter(ids))
            item_id = ids[id_type]

    if not item_id:
        return None

    # Build embed URL
    embed_url = (
        f"https://bandcamp.com/EmbeddedPlayer/"
        f"{id_type}={item_id}/size=large/"
        f"bgcol=1C1917/linkcol=7C3AED/"
        f"tracklist=false/transparent=true/"
    )

    def og_content(prop: str) -> str | None:
        """Return the decoded content of the og:<prop> meta tag, if present."""
        match = re.search(
            rf'<meta\s+property="og:{prop}"\s+content="([^"]+)"', page
        )
        # Attribute values are entity-encoded in the markup; decode them.
        return unescape(match.group(1).strip()) if match else None

    return {
        "embed_url": embed_url,
        "title": og_content("title") or "",
        # The artist name is taken from og:site_name.
        "artist": og_content("site_name") or "",
        "art_url": og_content("image"),
        "item_id": item_id,
        "item_type": id_type,
    }