diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..3db3a4c --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +# Copy this file to .env and fill in secrets + +# Meilisearch keys +MEILI_MASTER_KEY=change_me_to_strong_random +# NOTE: Docker Compose does not interpolate ${...} inside env_file values, +# so repeat the same literal secret as MEILI_MASTER_KEY here +MEILI_KEY=change_me_to_strong_random + +# OpenWebUI integration +OPENWEBUI_URL=http://host.docker.internal:3003 +OPENWEBUI_API_KEY=put_your_openwebui_api_key_here +OPENWEBUI_KB_NAME=Homelab Library \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4dec629 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +# Local env and secrets +.env + +# Runtime data +data/ +models/ +library/ +transcripts/ +tmp/ + +# Python cruft +__pycache__/ +*.pyc + +# Misc +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 29c5843..63e98d7 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,33 @@ -# PodX - Offline Podcast + Docs Unified Search (Docker) +# PodX - Offline Library with OpenWebUI export + +## Repo-friendly secrets +- Secrets live in **.env** at the repo root (NOT committed). +- Commit **.env.example**. Users copy it to `.env` and fill in their values. +- We also include **.gitignore** to keep `.env` and data paths out of git. ## Quick start ```bash -unzip homelab-podx.zip && cd homelab-podx +cp .env.example .env # edit values (MEILI_MASTER_KEY, OPENWEBUI_API_KEY, etc.) docker compose up -d --build -# Open the UI: -# http://:8088 +# UI: http://:8088 +# Meili: http://:7700 ``` +The worker reaches OpenWebUI at `$OPENWEBUI_URL` (default: http://host.docker.internal:3003). -Paste links to podcasts/video (YouTube, Rumble, direct MP3). Worker downloads with yt-dlp, transcribes locally with faster-whisper large-v3, stores media under library/, subtitles and transcripts under transcripts/, and indexes everything (plus your PDFs/EPUBs/Kiwix) in a single Meilisearch index: library. 
- -### Ingest PDFs +## Ingest helpers ```bash -MEILI_URL=http://localhost:7700 MEILI_KEY=devkey ./ingest/ingest_pdfs.sh /path/to/*.pdf +MEILI_URL=http://localhost:7700 MEILI_KEY=$MEILI_MASTER_KEY ./ingest/ingest_pdfs.sh /path/*.pdf +MEILI_URL=http://localhost:7700 MEILI_KEY=$MEILI_MASTER_KEY ./ingest/ingest_epub.py /path/*.epub +MEILI_URL=http://localhost:7700 MEILI_KEY=$MEILI_MASTER_KEY ./ingest/ingest_kiwix.sh /path/wiki.zim ``` -### Ingest EPUBs + +## Backfill existing files into OpenWebUI ```bash -pip install ebooklib beautifulsoup4 lxml requests -MEILI_URL=http://localhost:7700 MEILI_KEY=devkey ./ingest/ingest_epub.py /path/to/*.epub +# From repo root: +./tools/backfill_openwebui.sh +# Or include extra folders to scan: +./tools/backfill_openwebui.sh /some/other/folder /another/folder ``` - -### Ingest Kiwix ZIM -Install zim-tools: apt-get install zim-tools or brew install zimtools -```bash -MEILI_URL=http://localhost:7700 MEILI_KEY=devkey ./ingest/ingest_kiwix.sh /path/to/wikipedia_en.zim -``` - -### Optional: set Meilisearch settings -```bash -curl -X PATCH "http://localhost:7700/indexes/library/settings" -H "Authorization: Bearer devkey" -H "Content-Type: application/json" -d '{"searchableAttributes":["title","text","meta.book_author","meta.uploader","meta.chapter"], - "displayedAttributes":["type","title","source","date","_formatted","meta"], - "sortableAttributes":["date","type"], - "filterableAttributes":["type"]}' -``` - -### Plex -Point Plex libraries at library/. Video podcasts will show .srt/.vtt subtitles automatically if basenames match. - -### Notes -- Accuracy-first: model large-v3 (English + Czech). Change via WHISPER_MODEL env var in docker-compose.yml if desired. -- Everything runs offline; no cloud calls. +- Reads `.env` for `OPENWEBUI_URL`, `OPENWEBUI_API_KEY`, `OPENWEBUI_KB_NAME`. +- Uploads `*.txt`, `*.md`, `*.html` it finds in `./transcripts` and `./library/web` by default. 
diff --git a/app/app.py b/app/app.py index 070a206..0b486ac 100644 --- a/app/app.py +++ b/app/app.py @@ -4,7 +4,7 @@ from redis import Redis from rq import Queue MEILI_URL = os.getenv("MEILI_URL", "http://meili:7700") -MEILI_KEY = os.getenv("MEILI_KEY", "") +MEILI_KEY = os.getenv("MEILI_KEY", "") # from .env REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0") app = Flask(__name__) @@ -26,8 +26,8 @@ mark{background: #fff2a8}

PodX

- - + +
Batch
@@ -36,7 +36,7 @@ mark{background: #fff2a8}
-

Unified search (podcasts + PDFs + EPUB + Kiwix)

+

Unified search (podcasts + PDFs + EPUB + Kiwix + Web)

diff --git a/app/requirements.txt b/app/requirements.txt index bfa0a24..ef901b0 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -6,3 +6,7 @@ faster-whisper==1.0.3 ffmpeg-python==0.2.0 requests==2.32.3 orjson==3.10.7 +trafilatura==1.6.3 +lxml==5.3.0 +html5lib==1.1 +tldextract==5.1.2 diff --git a/app/worker.py b/app/worker.py index 7a50366..5e04436 100644 --- a/app/worker.py +++ b/app/worker.py @@ -10,6 +10,10 @@ TMP = Path(os.getenv("TMP_ROOT", "/tmpdl")) MODEL_NAME = os.getenv("WHISPER_MODEL","large-v3") COMPUTE = os.getenv("WHISPER_PRECISION","int8") +OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/") +OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "") +OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library") + TRN.mkdir(parents=True, exist_ok=True) LIB.mkdir(parents=True, exist_ok=True) TMP.mkdir(parents=True, exist_ok=True) @@ -17,9 +21,11 @@ TMP.mkdir(parents=True, exist_ok=True) model = WhisperModel(MODEL_NAME, compute_type=COMPUTE) def log(feed): - with open(TRN / "_feed.log", "a", encoding="utf-8") as f: - import orjson as _oj - f.write(_oj.dumps(feed).decode()+"\n") + try: + with open(TRN / "_feed.log", "a", encoding="utf-8") as f: + f.write(orjson.dumps(feed).decode()+"\n") + except Exception: + pass def sanitize(name): return re.sub(r'[\\/:"*?<>|]+', ' ', name).strip() @@ -42,15 +48,13 @@ def transcribe(media_path: Path): segments, info = model.transcribe(str(media_path), vad_filter=True, language="auto") title = media_path.stem base = TRN / title - segs = [] - text_parts = [] + segs, text_parts = [], [] for s in segments: segs.append({"start": s.start, "end": s.end, "text": s.text}) text_parts.append(s.text) txt = " ".join(text_parts).strip() - import orjson as _oj - open(base.with_suffix(".json"), "wb").write(_oj.dumps({"file": str(media_path), "language": info.language, "segments": segs})) + open(base.with_suffix(".json"), "wb").write(orjson.dumps({"file": str(media_path), "language": info.language, "segments": segs})) 
open(base.with_suffix(".txt"), "w", encoding="utf-8").write(txt) def fmt_ts(t): @@ -81,11 +85,124 @@ def index_meili(json_path: Path): } r = requests.post(f"{MEILI_URL}/indexes/library/documents", headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"}, - data=__import__('orjson').dumps(payload)) + data=orjson.dumps(payload)) r.raise_for_status() +import tldextract, trafilatura, requests as _requests + +def slugify(text): + text = re.sub(r'[^A-Za-z0-9\-._ ]+', '', text).strip().replace(' ', '_') + return text[:120] or 'page' + +def save_web_snapshot(url: str): + r = _requests.get(url, timeout=30, headers={"User-Agent":"Mozilla/5.0"}) + r.raise_for_status() + html = r.text + downloaded = trafilatura.load_html(html, url=url) + text = trafilatura.extract(downloaded, include_comments=False, include_images=False, with_metadata=True) or "" + meta = trafilatura.metadata.extract_metadata(downloaded) or None + title = (meta.title if meta and getattr(meta, 'title', None) else None) or (re.search(r']*>(.*?)', html, re.I|re.S).group(1).strip() if re.search(r']*>(.*?)', html, re.I|re.S) else url) + date = (meta.date if meta and getattr(meta, 'date', None) else "") + parts = tldextract.extract(url) + domain = ".".join([p for p in [parts.domain, parts.suffix] if p]) + slug = slugify(title) + outdir = LIB / "web" / domain + outdir.mkdir(parents=True, exist_ok=True) + base = outdir / slug + open(base.with_suffix(".html"), "w", encoding="utf-8", errors="ignore").write(html) + open(base.with_suffix(".txt"), "w", encoding="utf-8", errors="ignore").write(text) + return base, title, domain, date, text + +def index_web(base: Path, title: str, domain: str, date: str, text: str, url: str): + payload = { + "id": f"web:{domain}:{base.stem}", + "type": "web", + "title": title, + "date": re.sub(r'[^0-9]', '', date)[:8] if date else "", + "source": f"file://{str(base.with_suffix('.html'))}", + "text": text, + "segments": [], + "meta": {"url": url, "domain": domain} + 
} + r = requests.post(f"{MEILI_URL}/indexes/library/documents", + headers={"Authorization": f"Bearer {MEILI_KEY}", "Content-Type":"application/json"}, + data=orjson.dumps(payload)) + r.raise_for_status() + +def is_media_url(url: str): + lowered = url.lower() + media_hosts = ["youtube.com","youtu.be","rumble.com","vimeo.com","soundcloud.com","spotify.com","podbean.com","buzzsprout.com"] + return any(h in lowered for h in media_hosts) + +def owui_headers(): + return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {} + +def owui_get_or_create_kb(): + if not OWUI_URL or not OWUI_KEY: + return None + try: + r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=owui_headers(), timeout=15) + r.raise_for_status() + for kb in r.json().get("data", []): + if kb.get("name") == OWUI_KB: + return kb["id"] + except Exception: + pass + r = requests.post( + f"{OWUI_URL}/api/v1/knowledge/create", + headers={**owui_headers(), "Content-Type": "application/json"}, + data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"}), + timeout=15, + ) + r.raise_for_status() + return r.json()["data"]["id"] + +def owui_upload_and_attach(path: Path, kb_id: str): + with open(path, "rb") as f: + r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=owui_headers(), files={"file": (path.name, f)}, timeout=60*10) + r.raise_for_status() + file_id = r.json()["data"]["id"] + r = requests.post( + f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", + headers={**owui_headers(), "Content-Type": "application/json"}, + data=orjson.dumps({"file_id": file_id}), + timeout=60, + ) + r.raise_for_status() + return True + +def publish_to_openwebui(paths): + if not OWUI_URL or not OWUI_KEY: + return + try: + kb_id = owui_get_or_create_kb() + for p in paths: + p = Path(p) + if not p.exists(): + continue + try: + owui_upload_and_attach(p, kb_id) + except Exception as e: + log({"url": str(p), "status": "owui_error", "error": str(e)}) + except Exception as e: + log({"status": 
"owui_error", "error": str(e)}) + +def handle_web(url: str): + info = {"url": url, "status":"web-downloading", "title":"", "uploader":"", "date":"", "path":""} + log(info) + base, title, domain, date, text = save_web_snapshot(url) + info.update({"title": title, "uploader": domain, "date": date, "path": str(base.with_suffix('.html'))}) + log({**info, **{"status":"web-indexing"}}) + index_web(base, title, domain, date, text, url) + push = [p for p in [base.with_suffix('.txt'), base.with_suffix('.html')] if p.exists()] + publish_to_openwebui(push) + log({**info, **{"status":"done"}}) + def handle_url(url: str): try: + if not is_media_url(url): + handle_web(url) + return info = {"url": url, "status":"queued", "title":"", "uploader":"", "date":"", "path":""} log({**info, **{"status":"downloading"}}) files = yt_dlp(url, TMP) @@ -96,13 +213,13 @@ def handle_url(url: str): dest_dir.mkdir(parents=True, exist_ok=True) dest = dest_dir / sanitize(f.name) shutil.move(str(f), dest) - import re as _re info.update({"title": dest.stem, "uploader": uploader, - "date": _re.findall(r"\b(\d{8})\b", dest.stem)[0] if _re.findall(r"\b(\d{8})\b", dest.stem) else "", + "date": (re.findall(r"\b(\d{8})\b", dest.stem)[0] if re.findall(r"\b(\d{8})\b", dest.stem) else ""), "path": str(dest)}) log({**info, **{"status":"transcribing"}}) base = transcribe(dest) index_meili(base.with_suffix(".json")) + publish_to_openwebui([base.with_suffix(".txt")]) log({**info, **{"status":"done"}}) except Exception as e: log({"url": url, "status":"error", "error": str(e)}) diff --git a/docker-compose.yml b/docker-compose.yml index 27b0614..1f01224 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,38 +1,41 @@ +version: "3.9" services: - web: + podx-web: build: ./app container_name: podx-web + env_file: [.env] environment: - - MEILI_URL=http://meili:7700 - - MEILI_KEY=devkey - - REDIS_URL=redis://redis:6379/0 - - LIBRARY_ROOT=/library - - TRANSCRIPT_ROOT=/transcripts - - TMP_ROOT=/tmpdl - - 
WHISPER_MODEL=large-v3 - - WHISPER_PRECISION=int8 + MEILI_URL: http://meili:7700 + REDIS_URL: redis://redis:6379/0 + LIBRARY_ROOT: /library + TRANSCRIPT_ROOT: /transcripts + TMP_ROOT: /tmpdl + WHISPER_MODEL: large-v3 + WHISPER_PRECISION: int8 volumes: - ./library:/library - ./transcripts:/transcripts - ./tmp:/tmpdl - ./models:/root/.cache/huggingface ports: ["8088:8080"] - depends_on: [worker, meili, redis] + depends_on: [podx-worker, meili, redis] restart: unless-stopped + extra_hosts: + - host.docker.internal:host-gateway - worker: + podx-worker: build: ./app container_name: podx-worker command: ["python", "worker.py"] + env_file: [.env] environment: - - MEILI_URL=http://meili:7700 - - MEILI_KEY=devkey - - REDIS_URL=redis://redis:6379/0 - - LIBRARY_ROOT=/library - - TRANSCRIPT_ROOT=/transcripts - - TMP_ROOT=/tmpdl - - WHISPER_MODEL=large-v3 - - WHISPER_PRECISION=int8 + MEILI_URL: http://meili:7700 + REDIS_URL: redis://redis:6379/0 + LIBRARY_ROOT: /library + TRANSCRIPT_ROOT: /transcripts + TMP_ROOT: /tmpdl + WHISPER_MODEL: large-v3 + WHISPER_PRECISION: int8 volumes: - ./library:/library - ./transcripts:/transcripts @@ -40,13 +43,15 @@ services: - ./models:/root/.cache/huggingface depends_on: [meili, redis] restart: unless-stopped + extra_hosts: + - host.docker.internal:host-gateway meili: image: getmeili/meilisearch:v1.8 container_name: meili + env_file: [.env] environment: - - MEILI_MASTER_KEY=devkey - - MEILI_NO_ANALYTICS=true + MEILI_NO_ANALYTICS: "true" ports: ["7700:7700"] volumes: - ./data/meili:/meili_data diff --git a/ingest/ingest_epub.py b/ingest/ingest_epub.py index 2e6fce9..dc8df33 100755 --- a/ingest/ingest_epub.py +++ b/ingest/ingest_epub.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup import requests MEILI_URL = os.getenv("MEILI_URL","http://localhost:7700") -MEILI_KEY = os.getenv("MEILI_KEY","devkey") +MEILI_KEY = os.getenv("MEILI_KEY","change_me") def post(doc): r = requests.post(f"{MEILI_URL}/indexes/library/documents", @@ -26,7 +26,7 @@ for 
path in sys.argv[1:]: doc = { "id": hashlib.sha1((path+item.get_name()).encode()).hexdigest(), "type": "epub", - "title": f"{title} — {item.get_name()}", + "title": f"{title} - {item.get_name()}", "source": f"file://{os.path.abspath(path)}", "date": "", "text": text, diff --git a/ingest/ingest_kiwix.sh b/ingest/ingest_kiwix.sh index cd0b529..fa159f9 100755 --- a/ingest/ingest_kiwix.sh +++ b/ingest/ingest_kiwix.sh @@ -2,7 +2,7 @@ set -euo pipefail ZIM="$1" BASE_URL=${MEILI_URL:-http://localhost:7700} -KEY=${MEILI_KEY:-devkey} +KEY=${MEILI_KEY:-change_me} zimdump list "$ZIM" --json | jq -rc '.[] | select(.mimetype=="text/html") | .path' | while read -r path; do html="$(zimdump dump "$ZIM" "$path" 2>/dev/null || true)" diff --git a/ingest/ingest_pdfs.sh b/ingest/ingest_pdfs.sh index 216b5b4..3cac4b6 100755 --- a/ingest/ingest_pdfs.sh +++ b/ingest/ingest_pdfs.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail BASE_URL=${MEILI_URL:-http://localhost:7700} -KEY=${MEILI_KEY:-devkey} +KEY=${MEILI_KEY:-change_me} for pdf in "$@"; do title="$(basename "$pdf")" diff --git a/scripts/backfill_openwebui.py b/scripts/backfill_openwebui.py new file mode 100755 index 0000000..a6cb0ca --- /dev/null +++ b/scripts/backfill_openwebui.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +import os, sys +from pathlib import Path +import requests, orjson + +OWUI_URL = os.getenv("OPENWEBUI_URL", "").rstrip("/") +OWUI_KEY = os.getenv("OPENWEBUI_API_KEY", "") +OWUI_KB = os.getenv("OPENWEBUI_KB_NAME", "Homelab Library") + +LIB = Path(os.getenv("LIBRARY_ROOT", "./library")) +TRN = Path(os.getenv("TRANSCRIPT_ROOT", "./transcripts")) + +def headers(): + return {"Authorization": f"Bearer {OWUI_KEY}"} if OWUI_KEY else {} + +def get_or_create_kb(): + if not OWUI_URL or not OWUI_KEY: + print("OpenWebUI not configured.") + sys.exit(1) + r = requests.get(f"{OWUI_URL}/api/v1/knowledge/list", headers=headers(), timeout=15) + r.raise_for_status() + for kb in r.json().get("data", []): + if kb.get("name") == 
OWUI_KB: + return kb["id"] + r = requests.post(f"{OWUI_URL}/api/v1/knowledge/create", + headers={**headers(), "Content-Type":"application/json"}, + data=orjson.dumps({"name": OWUI_KB, "description": "All local content indexed by podx"})) + r.raise_for_status() + return r.json()["data"]["id"] + +def upload_and_attach(path: Path, kb_id: str): + with open(path, "rb") as f: + r = requests.post(f"{OWUI_URL}/api/v1/files/", headers=headers(), files={"file": (path.name, f)}, timeout=60*10) + r.raise_for_status() + file_id = r.json()["data"]["id"] + r = requests.post(f"{OWUI_URL}/api/v1/knowledge/{kb_id}/file/add", + headers={**headers(), "Content-Type":"application/json"}, + data=orjson.dumps({"file_id": file_id}), timeout=60) + r.raise_for_status() + print(f"Uploaded {path}") + +def main(): + kb_id = get_or_create_kb() + # transcripts + for txt in TRN.glob("*.txt"): + upload_and_attach(txt, kb_id) + # web snapshots + for txt in LIB.glob("web/**/*.txt"): + upload_and_attach(txt, kb_id) + +if __name__ == "__main__": + main() diff --git a/tools/backfill_openwebui.sh b/tools/backfill_openwebui.sh new file mode 100755 index 0000000..be8812b --- /dev/null +++ b/tools/backfill_openwebui.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# backfill_openwebui.sh +# Upload existing local files (transcripts, web snapshots, other .txt/.md/.html) +# into your OpenWebUI Knowledge Base using values from .env (OPENWEBUI_URL, OPENWEBUI_API_KEY, OPENWEBUI_KB_NAME). +# +# Usage: +# ./tools/backfill_openwebui.sh # default paths (./transcripts and ./library/web) +# ./tools/backfill_openwebui.sh /extra/folder1 ... # optional extra folders to scan +# +# Requirements: bash, curl, jq, find, xargs +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +ENV_FILE="$ROOT_DIR/.env" + +if [[ ! -f "$ENV_FILE" ]]; then + echo "ERROR: .env not found in repo root. Copy .env.example to .env and fill values." 
+ exit 1 +fi + +# Load .env line by line so values containing spaces survive intact +while IFS= read -r line; do [[ "$line" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]] && export "$line" || true; done < "$ENV_FILE" + +: "${OPENWEBUI_URL:?Set OPENWEBUI_URL in .env}" +: "${OPENWEBUI_API_KEY:?Set OPENWEBUI_API_KEY in .env}" +KB_NAME="${OPENWEBUI_KB_NAME:-Homelab Library}" + +BASE="$OPENWEBUI_URL" +KEY="$OPENWEBUI_API_KEY" + +command -v jq >/dev/null || { echo "ERROR: jq is required"; exit 1; } +command -v curl >/dev/null || { echo "ERROR: curl is required"; exit 1; } + +# Resolve or create the Knowledge Base +echo "Resolving Knowledge Base: ${KB_NAME}" +KB_ID="$(curl -s -H "Authorization: Bearer $KEY" "$BASE/api/v1/knowledge/list" \ + | jq -r --arg KB "$KB_NAME" '.data[] | select(.name==$KB) | .id' || true)" + +if [[ -z "$KB_ID" || "$KB_ID" == "null" ]]; then + echo "Creating Knowledge Base: ${KB_NAME}" + KB_ID="$(curl -s -X POST "$BASE/api/v1/knowledge/create" \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d "{\"name\":\"$KB_NAME\",\"description\":\"All local content indexed by podx\"}" \ + | jq -r '.data.id')" +fi + +if [[ -z "$KB_ID" || "$KB_ID" == "null" ]]; then + echo "ERROR: Could not get or create KB"; exit 1 +fi + +upload_and_attach() { + local file="$1" + local fname + fname="$(basename "$file")" + echo "Uploading: $file" + FID="$(curl -s -X POST "$BASE/api/v1/files/" \ + -H "Authorization: Bearer $KEY" \ + -F "file=@${file};filename=${fname}" | jq -r '.data.id')" + if [[ -z "$FID" || "$FID" == "null" ]]; then + echo "WARN: upload failed for $file"; return 0 + fi + curl -s -X POST "$BASE/api/v1/knowledge/$KB_ID/file/add" \ + -H "Authorization: Bearer $KEY" -H "Content-Type: application/json" \ + -d "{\"file_id\":\"$FID\"}" >/dev/null || { + echo "WARN: attach failed for $file" + } +} + +# Default folders +declare -a SCAN_DIRS +SCAN_DIRS+=( "$ROOT_DIR/transcripts" ) +SCAN_DIRS+=( "$ROOT_DIR/library/web" ) + +# Additional user-provided folders +if (( "$#" > 0 )); then + for d
in "$@"; do + SCAN_DIRS+=( "$d" ) + done +fi + +# Patterns to include +INCLUDE_PATTERNS=( -name '*.txt' -o -name '*.md' -o -name '*.html' -o -name '*.htm' ) + +# Iterate folders +for D in "${SCAN_DIRS[@]}"; do + if [[ -d "$D" ]]; then + echo "Scanning: $D" + # -print0/read -d '' handles spaces; keep the loop in this shell's pipeline + # so upload_and_attach, KB_ID, BASE and KEY remain visible (bash -c cannot see them) + find "$D" -type f \( "${INCLUDE_PATTERNS[@]}" \) -print0 | \ + while IFS= read -r -d '' f; do upload_and_attach "$f"; done + else + echo "Skip (not a directory): $D" + fi +done + +echo "Backfill finished. Check OpenWebUI → Knowledge → ${KB_NAME}."