Initial commit

This commit is contained in:
2025-09-04 15:21:07 +01:00
parent d87b02a7ac
commit bb0c5cc8ff
9 changed files with 446 additions and 1 deletions

36
ingest/ingest_epub.py Executable file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
import sys, os, hashlib, json
from ebooklib import epub
from bs4 import BeautifulSoup
import requests
# Meilisearch connection settings. Each value is read from the environment
# and falls back to a local-development default when the variable is unset.
MEILI_URL = os.environ.get("MEILI_URL", "http://localhost:7700")
MEILI_KEY = os.environ.get("MEILI_KEY", "devkey")
def post(doc, timeout=30):
    """Send one document to the Meilisearch ``library`` index.

    Args:
        doc: JSON-serialisable document to add to the index.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole ingest run.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.post(
        f"{MEILI_URL}/indexes/library/documents",
        headers={
            "Authorization": f"Bearer {MEILI_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps(doc),
        timeout=timeout,
    )
    r.raise_for_status()
# Item-type code ebooklib uses for XHTML content documents
# (ebooklib.ITEM_DOCUMENT); named here instead of a bare 9 in the loop.
_ITEM_DOCUMENT = 9

# Index every EPUB given on the command line: one Meilisearch document per
# non-empty content section of each book.
for path in sys.argv[1:]:
    book = epub.read_epub(path)

    # Look each Dublin Core field up once. Metadata entries are tuples whose
    # first element is the value; fall back to the file name when the book
    # has no (or an empty) title.
    titles = book.get_metadata("DC", "title")
    title = (titles[0][0] if titles else None) or os.path.basename(path)
    creators = book.get_metadata("DC", "creator")
    # Skip empty creator values so str.join never receives None.
    author = "; ".join(creator[0] for creator in creators if creator[0])

    n = 0
    for item in book.get_items_of_type(_ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_body_content(), "lxml")
        text = soup.get_text(separator=" ", strip=True)
        if not text.strip():
            # Nothing searchable in this section (e.g. an image-only page).
            continue
        n += 1
        section = item.get_name()
        doc = {
            # NOTE(review): the id is derived from the path exactly as given
            # on the command line, so indexing the same file via a different
            # relative path produces different ids.
            "id": hashlib.sha1((path + section).encode()).hexdigest(),
            "type": "epub",
            # Separate book title and section name; they were previously
            # concatenated with nothing in between.
            "title": f"{title} - {section}",
            "source": f"file://{os.path.abspath(path)}",
            "date": "",
            "text": text,
            "meta": {
                "book_title": title,
                "book_author": author,
                "chapter": section,
            },
        }
        post(doc)
    print(f"Indexed {title} ({n} sections)")

17
ingest/ingest_kiwix.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Index every text/html entry of a ZIM archive into the Meilisearch
# "library" index, one document per entry.
#
# Usage: ingest_kiwix.sh <file.zim>
# Environment:
#   MEILI_URL  Meilisearch base URL (default: http://localhost:7700)
#   MEILI_KEY  Meilisearch API key  (default: devkey)
set -euo pipefail

if [ "$#" -lt 1 ]; then
  echo "usage: $0 <file.zim>" >&2
  exit 2
fi
ZIM="$1"
BASE_URL=${MEILI_URL:-http://localhost:7700}
KEY=${MEILI_KEY:-devkey}

# NOTE(review): the zimdump invocations are kept exactly as they were;
# confirm that the installed zim-tools version supports `list --json` and
# `dump <file> <path>`.
zimdump list "$ZIM" --json \
  | jq -rc '.[] | select(.mimetype=="text/html") | .path' \
  | while IFS= read -r path; do
      # stdin is redirected so zimdump can never consume the path list that
      # feeds this loop. Entries that fail to dump are skipped (best effort).
      html="$(zimdump dump "$ZIM" "$path" 2>/dev/null </dev/null || true)"
      if [ -z "$html" ]; then
        continue
      fi

      # Crude tag stripping: replace tags with spaces, squeeze runs of
      # spaces, drop leading whitespace on each line. printf is used instead
      # of echo so content starting with "-n"/"-e" is passed through intact.
      text="$(printf '%s\n' "$html" | sed -e 's/<[^>]*>/ /g' | tr -s ' ' | sed 's/^[[:space:]]*//')"
      title="$(basename -- "$path" | sed 's/_/ /g')"
      id="$(printf '%s' "${ZIM}:${path}" | sha1sum | awk '{print $1}')"

      # The article text goes through stdin (jq -Rs reads it as one raw
      # string, available as `.`) rather than through --arg / a curl
      # argument, so large articles cannot hit the kernel's per-argument
      # size limit.
      # NOTE(review): meta.path carries the same value as source, as before.
      printf '%s' "$text" \
        | jq -Rsc --arg id "$id" --arg t "$title" --arg src "zim://$ZIM$path" \
            '{id:$id, type:"kiwix", title:$t, source:$src, date:"", text:., meta:{path:$src}}' \
        | curl -sS --fail -X POST "$BASE_URL/indexes/library/documents" \
            -H "Authorization: Bearer $KEY" \
            -H 'Content-Type: application/json' \
            --data-binary @- >/dev/null
      # The header previously expanded to "Bearer 'key'" (literal single
      # quotes); --fail makes HTTP errors abort the run instead of being
      # discarded along with the response body.
    done
echo "Indexed ZIM: $ZIM"

16
ingest/ingest_pdfs.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Index PDF files into the Meilisearch "library" index, one document per
# non-empty page.
#
# Usage: ingest_pdfs.sh <file.pdf>...
# Environment:
#   MEILI_URL  Meilisearch base URL (default: http://localhost:7700)
#   MEILI_KEY  Meilisearch API key  (default: devkey)
set -euo pipefail

BASE_URL=${MEILI_URL:-http://localhost:7700}
KEY=${MEILI_KEY:-devkey}

for pdf in "$@"; do
  title="$(basename -- "$pdf")"

  # Anchor the match so only pdfinfo's own "Pages:" line is picked up.
  pages="$(pdfinfo "$pdf" | awk '/^Pages:/ {print $2}')"
  if ! [[ "$pages" =~ ^[0-9]+$ ]]; then
    echo "Could not determine page count for $pdf; skipping" >&2
    continue
  fi

  for ((p = 1; p <= pages; p++)); do
    # Extract a single page and blank out whitespace-only lines; command
    # substitution then drops trailing newlines, so an empty page yields "".
    text="$(pdftotext -f "$p" -l "$p" -layout "$pdf" - | sed 's/^[[:space:]]*$//')"
    if [ -z "$text" ]; then
      continue
    fi

    # Meilisearch document ids may only contain [A-Za-z0-9_-]; the former
    # "<file name>-p<N>" id contained "." (and possibly spaces), so use a
    # SHA-1 hex digest like the other ingest scripts do.
    id="$(printf '%s' "${pdf}:p${p}" | sha1sum | awk '{print $1}')"

    # NOTE(review): meta.page keeps its previous "<file name>-p<N>" value;
    # confirm whether a bare page number is what consumers expect.
    page_label="${title}-p${p}"

    # Page text is streamed through stdin (jq -Rs exposes it as `.`) instead
    # of being passed as a command-line argument.
    printf '%s' "$text" \
      | jq -Rsc --arg id "$id" --arg t "$title" --arg src "file://$pdf" --arg page "$page_label" \
          '{id:$id, type:"pdf", title:$t, source:$src, date:"", text:., meta:{page:$page}}' \
      | curl -sS --fail -X POST "$BASE_URL/indexes/library/documents" \
          -H "Authorization: Bearer $KEY" \
          -H 'Content-Type: application/json' \
          --data-binary @- >/dev/null
    # The header previously expanded to "Bearer 'key'" (literal single
    # quotes); --fail makes HTTP errors abort the run instead of being
    # silently discarded.
  done
  echo "Indexed $title ($pages pages)"
done