Podcast sync
app/resolver.py | 105 lines | Normal file
@@ -0,0 +1,105 @@
# resolver.py
from __future__ import annotations
import json, os, re, subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple, List

try:
    from rapidfuzz import fuzz, process
except Exception:
    fuzz = None
    process = None

def _norm(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[\[\]\(\)\{\}|_]+", " ", s)
    s = re.sub(r"[^0-9a-zá-žà-ÿ\u00C0-\u024F\s]+", " ", s)  # keep latin accents, cz/diacritics
    s = re.sub(r"\s+", " ", s).strip()
    return s
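    # Illustrative behaviour (the input title is a made-up example):
    #   _norm("Episode 12: The [Live] Show!") -> "episode 12 the live show"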

def _title_from_filename(p: Path) -> str:
    name = p.stem  # drop extension
    # common yt-dlp patterns like "YYYYMMDD - Title"
    name = re.sub(r"^\d{8}\s*-\s*", "", name)
    return name
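    # Illustrative behaviour (hypothetical yt-dlp-style download name):
    #   _title_from_filename(Path("20240131 - Some Episode.mp3")) -> "Some Episode"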

def _ffprobe_duration_seconds(p: Path) -> Optional[int]:
    try:
        out = subprocess.check_output([
            "ffprobe", "-v", "error", "-show_entries", "format=duration",
            "-of", "default=nw=1:nk=1", str(p)
        ], stderr=subprocess.STDOUT, text=True).strip()
        return int(float(out))
    except Exception:
        return None
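    # With these flags ffprobe prints only the duration value (e.g. "3545.482000"),
    # which is then truncated to whole seconds above.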

def load_index(index_path: Path) -> List[Dict[str, Any]]:
    if not index_path.exists():
        return []
    with index_path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    # expected per-item keys:
    # title, pubdate_ts (int), duration_s (int or null),
    # transcript_urls: {"srt": str|None, "vtt": str|None, "txt": str|None},
    # audio_url, guid, feed_url
    return data if isinstance(data, list) else []
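    # Illustrative item matching the keys above (all values are placeholders):
    # {
    #   "title": "Episode 12: The Live Show",
    #   "pubdate_ts": 1706659200,
    #   "duration_s": 3545,
    #   "transcript_urls": {"srt": null, "vtt": "https://example.com/ep12.vtt", "txt": null},
    #   "audio_url": "https://example.com/ep12.mp3",
    #   "guid": "ep-12",
    #   "feed_url": "https://example.com/feed.xml"
    # }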

def match_episode(
    media_path: Path,
    index_items: List[Dict[str, Any]],
    duration_tolerance_s: int = 120,
    min_ratio: int = 82,
    date_window_days: int = 14,
) -> Optional[Dict[str, Any]]:
    title_guess = _title_from_filename(media_path)
    tnorm = _norm(title_guess)
    if not tnorm:
        return None

    media_secs = _ffprobe_duration_seconds(media_path)
    media_date = None
    # try to parse upload date prefix in filename if present
    m = re.search(r"(\d{8})", media_path.stem)
    if m:
        try:
            media_date = datetime.strptime(m.group(1), "%Y%m%d").replace(tzinfo=timezone.utc)
        except Exception:
            media_date = None

    candidates = []
    for item in index_items:
        item_title = _norm(item.get("title", ""))
        if not item_title:
            continue
        ratio = (fuzz.token_sort_ratio(tnorm, item_title) if fuzz else (100 if tnorm == item_title else 0))
        if ratio < min_ratio:
            continue

        # duration filter (if both known)
        ok_duration = True
        if media_secs and item.get("duration_s"):
            ok_duration = abs(media_secs - int(item["duration_s"])) <= duration_tolerance_s

        # date window (if both known)
        ok_date = True
        if media_date and item.get("pubdate_ts"):
            dt_item = datetime.fromtimestamp(int(item["pubdate_ts"]), tz=timezone.utc)
            delta_days = abs((media_date - dt_item).days)
            ok_date = delta_days <= date_window_days

        if ok_duration and ok_date:
            candidates.append((ratio, item))

    if not candidates:
        return None
    candidates.sort(key=lambda x: x[0], reverse=True)
    return candidates[0][1]
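    # Illustrative call (filename and index path are placeholders); raise min_ratio
    # or lower duration_tolerance_s for stricter matching:
    #   match_episode(Path("20240131 - Some Episode.mp3"),
    #                 load_index(Path("podcast_index.json")), min_ratio=90)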

def choose_transcript_url(item: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    urls = item.get("transcript_urls") or {}
    # prefer text/plain, then VTT, then SRT:
    if urls.get("txt"): return (urls["txt"], "txt")
    if urls.get("vtt"): return (urls["vtt"], "vtt")
    if urls.get("srt"): return (urls["srt"], "srt")
    return None
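
# Minimal usage sketch: resolve one downloaded file against the index and pick a
# transcript URL. The index path and the fallback filename below are placeholders.
if __name__ == "__main__":
    import sys

    index_items = load_index(Path("podcast_index.json"))
    media = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("20240131 - Some Episode.mp3")
    episode = match_episode(media, index_items)
    if episode is None:
        print(f"no match for {media.name}")
    else:
        picked = choose_transcript_url(episode)
        if picked:
            url, fmt = picked
            print(f"matched {episode.get('title')!r}; transcript ({fmt}): {url}")
        else:
            print(f"matched {episode.get('title')!r}; no transcript listed")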