Files
morpheus-docs/.gitea/workflows/refresh.yml
T

231 lines
9.4 KiB
YAML

name: Weekly docs refresh
# Runs the full pipeline: scrape upstream → rebuild indexes → push
# image. Scheduled weekly (Mondays). The reindex and image push are
# skipped when the scrape produces no diff against the committed
# corpus, unless a manual run sets the force_build input.
#
# IMPORTANT: the checkout step must use fetch-depth: 0 (full history)
# because the digest-history step walks git log up to --history-days
# back. With a shallow checkout the history file ships empty.
# Triggers: a weekly cron run plus a manual dispatch that can force the
# reindex + image build even when the scrape finds nothing new.
on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      # Read by the `if:` guards on the reindex / build / push steps.
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false
# Workflow-level environment, exported to every step.
env:
  # PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare Tunnel's
  # 100 MB body cap. PULL uses the public hostname (HTTPS). Same Gitea
  # registry either way — package lands under the same owner/repo.
  # Quoted so the host:port value is never re-typed by a YAML loader.
  REGISTRY_PUSH: "192.168.0.2:1234"
  REGISTRY_PULL: git.jpaul.io
  # Image name derives from the repo at runtime — clones don't need to
  # edit this. github.* is the Gitea-Actions inherited namespace.
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
  # Comma-separated Ollama endpoints; the indexer round-robins per
  # batch so the cards run in parallel. The first two are the
  # GPU-pinned containers on the Gitea host — same infra zerto-docs
  # uses (deploy/ollama-rag.docker-compose.yml over there): :11435 owns
  # the Titan X, :11436 owns the 1080 Ti. The host's primary Ollama on
  # :11434 is left alone for OpenWebUI etc.
  # NOTE(review): the list also names 192.168.0.125:11434 and
  # 192.168.0.126:11434, which the description above does not cover —
  # confirm what those hosts are and document them here.
  OLLAMA_URLS: "http://192.168.0.2:11435,http://192.168.0.2:11436,http://192.168.0.125:11434,http://192.168.0.126:11434"
  EMBED_MODEL: nomic-embed-text
  PRODUCT_NAME: morpheus
jobs:
  # Single job: scrape → commit corpus → reindex → build/push image →
  # link package → prune registry. Every step after the commit step is
  # gated on "corpus changed" or the manual force_build input.
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for digest-history. Default depth 1
          # silently produces a 0-byte history file.
          fetch-depth: 0
          # Set the credentials Gitea injects so we can push corpus
          # commits back. Persist them across the run.
          token: ${{ secrets.GITEA_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      - name: Refresh bundle catalog
        run: python -m scrape.bundles

      - name: Re-scrape all bundles
        # --force re-fetches every page so we actually see upstream
        # edits. Without it the runner skips pages already on disk.
        run: python -m scrape.runner --all --force --concurrency 6

      # ---- Build the digest history BEFORE committing ------------
      # See PLAN.md Phase 13. Walks recent corpus-touching commits
      # and writes corpus/.digest/history.jsonl. The current refresh
      # gets added on the NEXT run's history (one-week lag is fine).
      - name: Build digest history
        run: |
          mkdir -p corpus/.digest
          python -m scrape.changelog \
            --history-out corpus/.digest/history.jsonl \
            --history-days 120

      # ---- Commit + retry-on-race --------------------------------
      # Sets the step output `changed` (true/false) that gates every
      # later step.
      - name: Commit corpus changes (if any)
        id: commit
        env:
          # Branch this run checked out; the retry loop rebases onto it
          # rather than assuming the branch is literally named "main".
          TARGET_BRANCH: ${{ github.ref_name }}
        run: |
          # NOTE(review): "hvm-docs-refresh" looks inherited from a sibling
          # repo (PRODUCT_NAME here is morpheus) — confirm it is intended.
          git config user.name "hvm-docs-refresh"
          git config user.email "actions@jpaul.io"
          git add bundles.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"
          # First changelog line becomes the commit subject; the full
          # changelog becomes the commit body.
          python -m scrape.changelog --cached > /tmp/changelog.txt
          summary=$(head -1 /tmp/changelog.txt)
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          {
            echo "weekly refresh: ${ts} — ${summary}"
            echo ""
            cat /tmp/changelog.txt
          } > /tmp/commitmsg.txt
          git commit -F /tmp/commitmsg.txt
          # Retry on race: if the branch moved while we were scraping
          # (a human merged a PR during the run), `git push` rejects
          # with "fetch first". Rebase our corpus commit onto the
          # updated branch and retry. Corpus + code paths are disjoint,
          # so the rebase is trivially clean.
          # Falls back to main if the runner supplies no ref name.
          branch="${TARGET_BRANCH:-main}"
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts — bailing"
              exit 1
            fi
            git fetch origin "$branch"
            git rebase "origin/$branch" || { echo "rebase conflict — bailing"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Reindex Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image (LAN endpoint, buildx) -------------
      - name: Set up Docker Buildx
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/setup-buildx-action@v3
        with:
          # LAN registry is HTTP only. Buildkit needs an explicit
          # insecure-registry config or it tries to upgrade to HTTPS.
          # Host comes from the workflow-level REGISTRY_PUSH so it is
          # defined in exactly one place.
          config-inline: |
            [registry."${{ env.REGISTRY_PUSH }}"]
              http = true
              insecure = true

      - name: Configure registry credentials for buildx
        # Can't use docker/login-action against the LAN endpoint —
        # the host docker daemon errors on HTTP-vs-HTTPS. Buildx reads
        # ~/.docker/config.json directly, so write the auth ourselves.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          REGISTRY_USER: ${{ github.actor }}
        run: |
          mkdir -p ~/.docker
          AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)
          # Unquoted heredoc: $REGISTRY_PUSH (workflow env) and $AUTH
          # are expanded by the shell.
          cat > ~/.docker/config.json <<EOF
          {
            "auths": {
              "${REGISTRY_PUSH}": {
                "auth": "$AUTH"
              }
            }
          }
          EOF

      - name: Compute tags
        id: meta
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/metadata-action@v5
        with:
          # Tag with the LAN hostname so the push goes over LAN.
          # docker-compose on the deploy host pulls via the public
          # hostname (REGISTRY_PULL).
          images: ${{ env.REGISTRY_PUSH }}/${{ env.IMAGE }}
          tags: |
            type=raw,value=latest
            type=sha,prefix=,format=short
            type=schedule,pattern={{date 'YYYY.MM.DD'}}
            type=raw,value={{date 'YYYY.MM.DD'}}
          # Override auto-derived labels with the PUBLIC URL so Gitea
          # can auto-link the package back to this repo.
          labels: |
            org.opencontainers.image.source=https://${{ env.REGISTRY_PULL }}/${{ env.IMAGE }}
            org.opencontainers.image.url=https://${{ env.REGISTRY_PULL }}/${{ env.IMAGE }}

      - name: Build & push (amd64)
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/build-push-action@v6
        with:
          context: .
          platforms: linux/amd64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      - name: Link container package to this repo
        # Idempotent linkage so the package shows under the repo's
        # Packages tab. Gitea's auto-link from the source label is
        # unreliable in this setup (the runner reports an internal
        # server URL), so we link explicitly. 201 = newly linked;
        # 400 and 409 are both treated as "already linked" (success).
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          # Passed through env instead of being interpolated into the
          # script text, so the values are never parsed as shell code.
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: |
          code=$(curl -s -o /tmp/link.out -w "%{http_code}" -X POST \
            -H "Authorization: token ${GITEA_TOKEN}" \
            "https://${REGISTRY_PULL}/api/v1/packages/${REPO_OWNER}/container/${REPO_NAME}/-/link/${REPO_NAME}")
          echo "link ${REPO_OWNER}/container/${REPO_NAME} -> ${REPO_NAME}: HTTP ${code}"
          body=$(cat /tmp/link.out)
          case "$code" in
            201) echo "OK — newly linked" ;;
            400|409) echo "OK — already linked: ${body}" ;;
            *) echo "unexpected: ${body}"; exit 1 ;;
          esac

      # ---- Registry GC -------------------------------------------
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          # Same env pass-through as the link step: no expression
          # interpolation inside the script body.
          REPO_OWNER: ${{ github.repository_owner }}
          REPO_NAME: ${{ github.event.repository.name }}
        run: |
          python scripts/registry_gc.py \
            --owner "$REPO_OWNER" \
            --package "$REPO_NAME" \
            --keep-days 90 \
            --keep-latest 5