Initial commit
This commit is contained in:
16
ingest/ingest_pdfs.sh
Executable file
16
ingest/ingest_pdfs.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
#
# Index PDF files into the Meilisearch "library" index, one document per page.
#
# Usage: ingest_pdfs.sh FILE.pdf [FILE.pdf ...]
#
# Environment:
#   MEILI_URL  Base URL of the Meilisearch server (default: http://localhost:7700)
#   MEILI_KEY  API key sent as a Bearer token     (default: devkey)
#
# External tools used: pdfinfo, pdftotext, awk, sed, tr, jq, curl.
set -euo pipefail

BASE_URL=${MEILI_URL:-http://localhost:7700}
KEY=${MEILI_KEY:-devkey}

#######################################
# Print the document id for one page of a PDF.
# Meilisearch only accepts ids made of [A-Za-z0-9_-], so every other byte
# of the title (the "." of ".pdf", spaces, ...) is replaced with "_".
# Arguments: $1 - title (file basename), $2 - page number
# Outputs:   the id, "<sanitized title>-p<page>", without a trailing newline
#######################################
doc_id() {
  local safe
  safe=$(printf '%s' "$1" | LC_ALL=C tr -c 'A-Za-z0-9_-' '_')
  printf '%s-p%s' "$safe" "$2"
}

#######################################
# Print the Authorization header line built from KEY.
# Globals: KEY (read)
#######################################
auth_header() {
  printf 'Authorization: Bearer %s' "$KEY"
}

#######################################
# Print the page count pdfinfo reports for a PDF.
# Matches the field name exactly so that other metadata lines that merely
# contain "Pages:" (e.g. in the title) are not picked up.
# Arguments: $1 - path to the PDF
#######################################
page_count() {
  pdfinfo "$1" | awk '$1 == "Pages:" { print $2 }'
}

#######################################
# Print the layout-preserving text of a single page, with whitespace-only
# lines reduced to empty lines (so a blank page yields an empty string once
# command substitution strips the trailing newlines).
# Arguments: $1 - path to the PDF, $2 - page number
#######################################
page_text() {
  pdftotext -f "$2" -l "$2" -layout "$1" - | sed 's/^[[:space:]]*$//'
}

#######################################
# Print the compact JSON document for one page.
# Arguments: $1 - id, $2 - title, $3 - PDF path, $4 - page number, $5 - text
#######################################
page_doc() {
  # --argjson passes the page as a JSON number; meta.page holds the page
  # number itself rather than a copy of the document id.
  jq -nc \
    --arg id "$1" \
    --arg t "$2" \
    --arg src "file://$3" \
    --argjson page "$4" \
    --arg txt "$5" \
    '{id:$id, type:"pdf", title:$t, source:$src, date:"", text:$txt, meta:{page:$page}}'
}

#######################################
# POST one JSON document to the "library" index.
# --fail makes curl exit non-zero on HTTP 4xx/5xx, so a rejected request
# (bad key, wrong URL) stops the script instead of being discarded.
# Globals:   BASE_URL (read), KEY (read, via auth_header)
# Arguments: $1 - JSON document
#######################################
post_doc() {
  curl -sS --fail -X POST "$BASE_URL/indexes/library/documents" \
    -H "$(auth_header)" \
    -H 'Content-Type: application/json' \
    --data-binary "$1" >/dev/null
}

#######################################
# Index every page of every PDF given on the command line.
# Pages whose extracted text is empty are skipped.
# Arguments: paths to PDF files (none is a no-op)
# Outputs:   one "Indexed ..." line per PDF on stdout; errors on stderr
#######################################
main() {
  local pdf title pages p text id doc

  for pdf in "$@"; do
    title=$(basename -- "$pdf")

    pages=$(page_count "$pdf")
    if [[ ! "$pages" =~ ^[0-9]+$ ]]; then
      printf 'error: cannot determine page count of %s\n' "$pdf" >&2
      exit 1
    fi

    for ((p = 1; p <= pages; p++)); do
      text=$(page_text "$pdf" "$p")
      [[ -n "$text" ]] || continue

      id=$(doc_id "$title" "$p")
      doc=$(page_doc "$id" "$title" "$pdf" "$p" "$text")
      post_doc "$doc"
    done

    printf 'Indexed %s (%s pages)\n' "$title" "$pages"
  done
}

main "$@"
|
Reference in New Issue
Block a user