---
name: Weekly docs refresh

# End-to-end weekly pipeline: scrape upstream, rebuild the indexes, then
# push the container image. Scheduled for Mondays. When the scrape yields
# no diff against the committed corpus, the reindex and the image push are
# both skipped.
#
# IMPORTANT: the checkout step (actions/checkout@v4) must keep
# fetch-depth: 0. The digest-history step walks `git log` as far back as
# --history-days; on a shallow checkout the history file ships empty.

on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false

env:
  # Registry endpoints. PUSH goes to the LAN endpoint (HTTP) to bypass
  # Cloudflare Tunnel's 100 MB body cap. PULL uses the public hostname
  # (HTTPS). Same Gitea registry either way — the package lands under the
  # same owner/repo.
  # Quoted so no YAML parser can mistake the host:port for anything but a
  # string.
  REGISTRY_PUSH: "192.168.0.2:1234"
  REGISTRY_PULL: git.jpaul.io

  # Image name derives from the repo at runtime — clones don't need to
  # edit this. github.* is the Gitea-Actions inherited namespace.
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}

  # Comma-separated list of Ollama endpoints.
  #
  # The first two entries are the GPU-pinned Ollama containers on the Gitea
  # host — same infra zerto-docs uses (deploy/ollama-rag.docker-compose.yml
  # over there). :11435 owns the Titan X, :11436 owns the 1080 Ti; the
  # indexer round-robins per batch so both cards run in parallel. The
  # host's primary Ollama on :11434 is left alone for OpenWebUI etc.
  #
  # NOTE(review): the list also contains :11434 on 192.168.0.125 and
  # 192.168.0.126, which the description above does not cover. Confirm
  # these extra hosts are intentional and document what they are.
  OLLAMA_URLS: "http://192.168.0.2:11435,http://192.168.0.2:11436,http://192.168.0.125:11434,http://192.168.0.126:11434"
  EMBED_MODEL: nomic-embed-text

  PRODUCT_NAME: morpheus

jobs:
  # Single job. Order of operations:
  #   1. scrape upstream and rebuild the digest history
  #   2. commit + push corpus changes (sets steps.commit.outputs.changed)
  #   3. only if the corpus changed OR force_build was requested:
  #      reindex, build + push the image, link the package, prune old
  #      versions.
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for digest-history. Default depth 1
          # silently produces a 0-byte history file.
          fetch-depth: 0
          # Set the credentials Gitea injects so we can push corpus
          # commits back. Persist them across the run.
          token: ${{ secrets.GITEA_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string.
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      - name: Refresh bundle catalog
        run: python -m scrape.bundles

      - name: Re-scrape all bundles
        # --force re-fetches every page so we actually see upstream
        # edits. Without it the runner skips pages already on disk.
        run: python -m scrape.runner --all --force --concurrency 6

      # ---- Build the digest history BEFORE committing ------------
      # See PLAN.md Phase 13. Walks recent corpus-touching commits
      # and writes corpus/.digest/history.jsonl. The current refresh
      # gets added on the NEXT run's history (one-week lag is fine).
      - name: Build digest history
        run: |
          mkdir -p corpus/.digest
          python -m scrape.changelog \
            --history-out corpus/.digest/history.jsonl \
            --history-days 120

      # ---- Commit + retry-on-race --------------------------------
      # Publishes `changed=true|false` as a step output; every later step
      # gates on it (or on the force_build input).
      - name: Commit corpus changes (if any)
        id: commit
        run: |
          git config user.name "hvm-docs-refresh"
          git config user.email "actions@jpaul.io"
          git add bundles.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"
          # Commit message: one-line summary (first changelog line) followed
          # by the full changelog of the staged diff.
          python -m scrape.changelog --cached > /tmp/changelog.txt
          summary=$(head -1 /tmp/changelog.txt)
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          {
            echo "weekly refresh: ${ts} — ${summary}"
            echo ""
            cat /tmp/changelog.txt
          } > /tmp/commitmsg.txt
          git commit -F /tmp/commitmsg.txt
          # Retry on race: if main moved while we were scraping (a
          # human merged a PR during the run), `git push` rejects
          # with "fetch first". Rebase our corpus commit onto new
          # main and retry. Corpus + code paths are disjoint, so
          # the rebase is trivially clean.
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts — bailing"
              exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Reindex Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image (LAN endpoint, buildx) -------------
      - name: Set up Docker Buildx
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/setup-buildx-action@v3
        with:
          # LAN registry is HTTP only. Buildkit needs an explicit
          # insecure-registry config or it tries to upgrade to HTTPS.
          # The host comes from env.REGISTRY_PUSH so the endpoint is
          # defined in exactly one place.
          config-inline: |
            [registry."${{ env.REGISTRY_PUSH }}"]
              http = true
              insecure = true

      - name: Configure registry credentials for buildx
        # Can't use docker/login-action against the LAN endpoint —
        # the host docker daemon errors on HTTP-vs-HTTPS. Buildx reads
        # ~/.docker/config.json directly, so write the auth ourselves.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          REGISTRY_USER: ${{ github.actor }}
        run: |
          mkdir -p ~/.docker
          AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)
          # REGISTRY_PUSH is the workflow-level env var (LAN host:port).
          cat > ~/.docker/config.json <<EOF
          {
            "auths": {
              "${REGISTRY_PUSH}": {
                "auth": "$AUTH"
              }
            }
          }
          EOF

      - name: Compute tags
        id: meta
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/metadata-action@v5
        with:
          # Tag with the LAN hostname so the push goes over LAN.
          # docker-compose on the deploy host pulls via git.jpaul.io.
          images: ${{ env.REGISTRY_PUSH }}/${{ github.repository_owner }}/${{ github.event.repository.name }}
          # type=schedule only applies on cron runs; the type=raw date entry
          # also covers manual dispatches. Both entries are kept on purpose.
          # NOTE(review): type=sha presumably resolves to the triggering
          # commit, not the corpus commit pushed earlier in this run, while
          # the build context is the working tree — confirm the short-SHA
          # tag is meant to name the pre-refresh commit.
          tags: |
            type=raw,value=latest
            type=sha,prefix=,format=short
            type=schedule,pattern={{date 'YYYY.MM.DD'}}
            type=raw,value={{date 'YYYY.MM.DD'}}
          # Override auto-derived labels with the PUBLIC URL so Gitea
          # can auto-link the package back to this repo.
          labels: |
            org.opencontainers.image.source=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
            org.opencontainers.image.url=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}

      - name: Build & push (amd64)
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/build-push-action@v6
        with:
          context: .
          platforms: linux/amd64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Link container package to this repo
        # Idempotent linkage so the package shows under the repo's
        # Packages tab. Gitea's auto-link from the source label is
        # unreliable in this setup (the runner reports an internal
        # server URL), so we link explicitly. 201 = newly linked,
        # 400 or 409 = already linked (treated as success); anything
        # else fails the step.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          # Passed via env instead of being interpolated into the script
          # text, so the values are never parsed as shell code.
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: |
          # -S keeps curl quiet on success but prints transport errors.
          code=$(curl -sS -o /tmp/link.out -w "%{http_code}" -X POST \
            -H "Authorization: token ${GITEA_TOKEN}" \
            "https://git.jpaul.io/api/v1/packages/${REPO_OWNER}/container/${REPO_NAME}/-/link/${REPO_NAME}")
          echo "link ${REPO_OWNER}/container/${REPO_NAME} -> ${REPO_NAME}: HTTP ${code}"
          body=$(cat /tmp/link.out)
          case "$code" in
            201) echo "OK — newly linked" ;;
            400|409) echo "OK — already linked: ${body}" ;;
            *) echo "unexpected: ${body}"; exit 1 ;;
          esac

      # ---- Registry GC -------------------------------------------
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          # Passed via env instead of being interpolated into the script
          # text, so the values are never parsed as shell code.
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: |
          python scripts/registry_gc.py \
            --owner "$REPO_OWNER" \
            --package "$REPO_NAME" \
            --keep-days 90 \
            --keep-latest 5