From 34d30e3134f8b05ab164f13e8bf1be4196edfbce Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sat, 6 Jun 2026 21:46:09 -0400 Subject: [PATCH] Add media (object storage) and the background worker (Phase 1) Media model + migration; an ObjectStore interface with an S3/MinIO (boto3) implementation behind the service layer. Upload (multipart) stores bytes in object storage + a metadata row (checksum, size, content-type, optional attach to person/event/source); list returns presigned URLs; delete is soft. Editor-gated, privacy-filtered, audited. 24 tests pass (object store faked). Introduces the worker container (same image, 'python -m app.worker'): its first job is the scheduled 30-day soft-delete purge across tables + media object cleanup. Compose gains worker + S3 env on backend/worker; dev override builds the worker too. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Justin Paul --- backend/app/api/deps.py | 9 ++ backend/app/api/v1/__init__.py | 2 + backend/app/api/v1/media.py | 62 ++++++++++ backend/app/core/config.py | 12 ++ .../app/integrations/objectstore/__init__.py | 0 backend/app/integrations/objectstore/base.py | 22 ++++ backend/app/integrations/objectstore/s3.py | 56 +++++++++ backend/app/models/__init__.py | 2 + backend/app/models/media.py | 36 ++++++ backend/app/schemas/media.py | 22 ++++ backend/app/services/media_service.py | 107 ++++++++++++++++++ backend/app/worker.py | 103 +++++++++++++++++ .../migrations/versions/7fc7024ef432_media.py | 65 +++++++++++ backend/pyproject.toml | 6 + backend/tests/conftest.py | 23 +++- backend/tests/test_media.py | 45 ++++++++ backend/uv.lock | 92 +++++++++++++++ deploy/docker-compose.dev.yml | 5 + deploy/docker-compose.yml | 29 +++++ 19 files changed, 697 insertions(+), 1 deletion(-) create mode 100644 backend/app/api/v1/media.py create mode 100644 backend/app/integrations/objectstore/__init__.py create mode 100644 backend/app/integrations/objectstore/base.py create mode 100644 backend/app/integrations/objectstore/s3.py create mode 100644 backend/app/models/media.py create mode 100644 backend/app/schemas/media.py create mode 100644 backend/app/services/media_service.py create mode 100644 backend/app/worker.py create mode 100644 backend/migrations/versions/7fc7024ef432_media.py create mode 100644 backend/tests/test_media.py diff --git a/backend/app/api/deps.py b/backend/app/api/deps.py index 637809a..087a2de 100644 --- a/backend/app/api/deps.py +++ b/backend/app/api/deps.py @@ -10,6 +10,8 @@ from app.core.db import get_session from app.integrations.mailer.base import Mailer from app.integrations.mailer.console import ConsoleMailer from app.integrations.mailer.smtp import SMTPMailer +from app.integrations.objectstore.base import ObjectStore +from app.integrations.objectstore.s3 import S3ObjectStore from app.models.user import User from app.services import auth_service @@ -46,3 +48,10 @@ def get_mailer() -> Mailer: MailerDep = Annotated[Mailer, Depends(get_mailer)] + + +def get_objectstore() -> ObjectStore: + return S3ObjectStore(get_settings()) + + +ObjectStoreDep = Annotated[ObjectStore, Depends(get_objectstore)] diff --git a/backend/app/api/v1/__init__.py b/backend/app/api/v1/__init__.py index 5a67ae3..d29607a 100644 --- a/backend/app/api/v1/__init__.py +++ b/backend/app/api/v1/__init__.py @@ -6,6 +6,7 @@ from app.api.v1 import ( auth, citations, events, + media, persons, relationships, sources, @@ -22,3 +23,4 @@ api_router.include_router(events.router) api_router.include_router(relationships.router) api_router.include_router(sources.router) api_router.include_router(citations.router) +api_router.include_router(media.router) diff --git a/backend/app/api/v1/media.py b/backend/app/api/v1/media.py new file mode 100644 index 0000000..614be77 --- /dev/null +++ b/backend/app/api/v1/media.py @@ -0,0 +1,62 @@ +import uuid + +from fastapi import APIRouter, File, Form, UploadFile, status + +from app.api.deps import CurrentUser, ObjectStoreDep, SessionDep +from app.schemas.media import MediaRead +from app.services import media_service, tree_service + +router = APIRouter(prefix="/trees", tags=["media"]) + + +def _with_url(media, url: str) -> MediaRead: + out = MediaRead.model_validate(media) + out.url = url + return out + + +@router.post("/{tree_id}/media", response_model=MediaRead, status_code=status.HTTP_201_CREATED) +async def upload_media( + tree_id: uuid.UUID, + session: SessionDep, + current: CurrentUser, + store: ObjectStoreDep, + file: UploadFile = File(...), + title: str | None = Form(None), + person_id: uuid.UUID | None = Form(None), + event_id: uuid.UUID | None = Form(None), + source_id: uuid.UUID | None = Form(None), +) -> MediaRead: + tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id) + data = await file.read() + media = await media_service.upload_media( + session, + store, + actor=current, + tree=tree, + data=data, + filename=file.filename or "upload", + content_type=file.content_type or "application/octet-stream", + title=title, + person_id=person_id, + event_id=event_id, + source_id=source_id, + ) + return _with_url(media, await store.presigned_get_url(key=media.storage_key)) + + +@router.get("/{tree_id}/media", response_model=list[MediaRead]) +async def list_media( + tree_id: uuid.UUID, session: SessionDep, current: CurrentUser, store: ObjectStoreDep +) -> list[MediaRead]: + tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id) + items = await media_service.list_media(session, viewer_id=current.id, tree=tree) + return [_with_url(m, await store.presigned_get_url(key=m.storage_key)) for m in items] + + +@router.delete("/{tree_id}/media/{media_id}", status_code=status.HTTP_204_NO_CONTENT) +async def delete_media( + tree_id: uuid.UUID, media_id: uuid.UUID, session: SessionDep, current: CurrentUser +) -> None: + tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id) + await media_service.delete_media(session, actor=current, tree=tree, media_id=media_id) diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 0a8aecb..edf232d 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -35,6 +35,18 @@ class Settings(BaseSettings): # Base URL used to build links in outbound email. app_base_url: str = "http://localhost" + # --- Object storage (S3-compatible / MinIO) --- + s3_endpoint_url: str = "http://minio:9000" + s3_bucket: str = "provenance" + s3_access_key: str = "provenance" + s3_secret_key: str = "change-me-too" + s3_region: str = "us-east-1" + s3_presign_ttl: int = 3600 # seconds + + # --- Worker --- + purge_interval_seconds: int = 3600 # how often to run the soft-delete purge + purge_after_days: int = 30 # soft-deleted rows older than this are purged + # --- Email (SMTP) --- mailer: str = Field(default="console", description="console | smtp") smtp_host: str | None = None diff --git a/backend/app/integrations/objectstore/__init__.py b/backend/app/integrations/objectstore/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/integrations/objectstore/base.py b/backend/app/integrations/objectstore/base.py new file mode 100644 index 0000000..806d3aa --- /dev/null +++ b/backend/app/integrations/objectstore/base.py @@ -0,0 +1,22 @@ +"""ObjectStore interface — pluggable binary storage behind the service layer. + +Implementations are S3-compatible (MinIO for self-host, any S3 otherwise). +Methods are async wrappers so the service layer stays non-blocking even though +the underlying SDK (boto3) is synchronous. +""" + +from abc import ABC, abstractmethod + + +class ObjectStore(ABC): + @abstractmethod + async def ensure_bucket(self) -> None: ... + + @abstractmethod + async def put_object(self, *, key: str, data: bytes, content_type: str) -> None: ... + + @abstractmethod + async def presigned_get_url(self, *, key: str) -> str: ... + + @abstractmethod + async def delete_object(self, *, key: str) -> None: ... diff --git a/backend/app/integrations/objectstore/s3.py b/backend/app/integrations/objectstore/s3.py new file mode 100644 index 0000000..033bb02 --- /dev/null +++ b/backend/app/integrations/objectstore/s3.py @@ -0,0 +1,56 @@ +"""S3-compatible ObjectStore (boto3), suitable for MinIO or any S3 provider. + +boto3 is synchronous; each call is dispatched to a thread so request handlers +and the worker stay async.""" + +import asyncio + +import boto3 +from botocore.client import Config +from botocore.exceptions import ClientError + +from app.core.config import Settings +from app.integrations.objectstore.base import ObjectStore + + +class S3ObjectStore(ObjectStore): + def __init__(self, settings: Settings) -> None: + self.bucket = settings.s3_bucket + self.presign_ttl = settings.s3_presign_ttl + self._client = boto3.client( + "s3", + endpoint_url=settings.s3_endpoint_url, + aws_access_key_id=settings.s3_access_key, + aws_secret_access_key=settings.s3_secret_key, + region_name=settings.s3_region, + config=Config(signature_version="s3v4"), + ) + + def _ensure_bucket_sync(self) -> None: + try: + self._client.head_bucket(Bucket=self.bucket) + except ClientError: + self._client.create_bucket(Bucket=self.bucket) + + async def ensure_bucket(self) -> None: + await asyncio.to_thread(self._ensure_bucket_sync) + + async def put_object(self, *, key: str, data: bytes, content_type: str) -> None: + await asyncio.to_thread( + self._client.put_object, + Bucket=self.bucket, + Key=key, + Body=data, + ContentType=content_type, + ) + + async def presigned_get_url(self, *, key: str) -> str: + return await asyncio.to_thread( + self._client.generate_presigned_url, + "get_object", + Params={"Bucket": self.bucket, "Key": key}, + ExpiresIn=self.presign_ttl, + ) + + async def delete_object(self, *, key: str) -> None: + await asyncio.to_thread(self._client.delete_object, Bucket=self.bucket, Key=key) diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index dcc886e..9d17967 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -5,6 +5,7 @@ from app.models.audit import AuditEntry from app.models.auth import Session, UserToken from app.models.base import Base from app.models.event import Event +from app.models.media import Media from app.models.person import Name, Person from app.models.place import Place, PlaceName from app.models.relationship import Relationship @@ -28,4 +29,5 @@ __all__ = [ "AuditEntry", "Session", "UserToken", + "Media", ] diff --git a/backend/app/models/media.py b/backend/app/models/media.py new file mode 100644 index 0000000..c3a146a --- /dev/null +++ b/backend/app/models/media.py @@ -0,0 +1,36 @@ +"""Media — a binary asset (image, scan, PDF, audio) in object storage. The row +holds metadata + checksum + the storage key; the bytes live in the ObjectStore. +Optionally attached to a single fact (person, event, or source) for now.""" + +import uuid + +from sqlalchemy import BigInteger, ForeignKey, String +from sqlalchemy.orm import Mapped, mapped_column + +from app.models.base import Base +from app.models.mixins import SoftDelete, TenantScoped, Timestamps, UUIDPrimaryKey + + +class Media(Base, UUIDPrimaryKey, TenantScoped, Timestamps, SoftDelete): + __tablename__ = "media" + + uploader_id: Mapped[uuid.UUID | None] = mapped_column( + ForeignKey("users.id", ondelete="SET NULL"), index=True + ) + storage_key: Mapped[str] = mapped_column(String(512), unique=True) + original_filename: Mapped[str] = mapped_column(String(512)) + content_type: Mapped[str] = mapped_column(String(128)) + byte_size: Mapped[int] = mapped_column(BigInteger) + checksum_sha256: Mapped[str] = mapped_column(String(64), index=True) + title: Mapped[str | None] = mapped_column(String(512)) + + # Optional single attachment target. + person_id: Mapped[uuid.UUID | None] = mapped_column( + ForeignKey("persons.id", ondelete="SET NULL"), index=True + ) + event_id: Mapped[uuid.UUID | None] = mapped_column( + ForeignKey("events.id", ondelete="SET NULL"), index=True + ) + source_id: Mapped[uuid.UUID | None] = mapped_column( + ForeignKey("sources.id", ondelete="SET NULL"), index=True + ) diff --git a/backend/app/schemas/media.py b/backend/app/schemas/media.py new file mode 100644 index 0000000..e7dff40 --- /dev/null +++ b/backend/app/schemas/media.py @@ -0,0 +1,22 @@ +import uuid +from datetime import datetime + +from pydantic import BaseModel, ConfigDict + + +class MediaRead(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: uuid.UUID + tree_id: uuid.UUID + original_filename: str + content_type: str + byte_size: int + checksum_sha256: str + title: str | None + person_id: uuid.UUID | None + event_id: uuid.UUID | None + source_id: uuid.UUID | None + created_at: datetime + # Presigned download URL, filled in by the router from the ObjectStore. + url: str | None = None diff --git a/backend/app/services/media_service.py b/backend/app/services/media_service.py new file mode 100644 index 0000000..6b03a9c --- /dev/null +++ b/backend/app/services/media_service.py @@ -0,0 +1,107 @@ +"""Media service. Bytes go to the ObjectStore; a metadata row goes to the DB. +Writes require editor rights; reads go through the privacy engine.""" + +import hashlib +import uuid +from datetime import UTC, datetime + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.integrations.objectstore.base import ObjectStore +from app.models.media import Media +from app.models.tree import Tree +from app.models.user import User +from app.services import privacy +from app.services.audit import record_audit +from app.services.exceptions import Forbidden, NotFound + + +async def upload_media( + session: AsyncSession, + store: ObjectStore, + *, + actor: User, + tree: Tree, + data: bytes, + filename: str, + content_type: str, + title: str | None = None, + person_id: uuid.UUID | None = None, + event_id: uuid.UUID | None = None, + source_id: uuid.UUID | None = None, +) -> Media: + if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree): + raise Forbidden("not an editor of this tree") + + media_id = uuid.uuid4() + key = f"{tree.id}/{media_id}/{filename}" + await store.ensure_bucket() + await store.put_object(key=key, data=data, content_type=content_type) + + media = Media( + id=media_id, + tree_id=tree.id, + uploader_id=actor.id, + storage_key=key, + original_filename=filename, + content_type=content_type, + byte_size=len(data), + checksum_sha256=hashlib.sha256(data).hexdigest(), + title=title, + person_id=person_id, + event_id=event_id, + source_id=source_id, + ) + session.add(media) + await session.flush() + record_audit( + session, + action="create", + entity_type="Media", + entity_id=media.id, + tree_id=tree.id, + actor_user_id=actor.id, + after={"filename": filename, "bytes": len(data)}, + ) + await session.commit() + await session.refresh(media) + return media + + +async def list_media(session: AsyncSession, *, viewer_id: uuid.UUID, tree: Tree) -> list[Media]: + if not await privacy.can_view_tree(session, user_id=viewer_id, tree=tree): + raise Forbidden("not permitted to view this tree") + stmt = ( + select(Media) + .where(Media.tree_id == tree.id, Media.deleted_at.is_(None)) + .order_by(Media.created_at.desc()) + ) + return list((await session.execute(stmt)).scalars().all()) + + +async def delete_media( + session: AsyncSession, *, actor: User, tree: Tree, media_id: uuid.UUID +) -> None: + if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree): + raise Forbidden("not an editor of this tree") + media = ( + await session.execute( + select(Media).where( + Media.id == media_id, Media.tree_id == tree.id, Media.deleted_at.is_(None) + ) + ) + ).scalar_one_or_none() + if media is None: + raise NotFound("media not found") + # Soft delete the row; the object is removed by the worker's purge job. + media.deleted_at = datetime.now(UTC) + record_audit( + session, + action="delete", + entity_type="Media", + entity_id=media.id, + tree_id=tree.id, + actor_user_id=actor.id, + ) + await session.commit() diff --git a/backend/app/worker.py b/backend/app/worker.py new file mode 100644 index 0000000..132d719 --- /dev/null +++ b/backend/app/worker.py @@ -0,0 +1,103 @@ +"""Background worker. Same image as the backend, run in worker mode +(`python -m app.worker`). First job: the scheduled soft-delete purge — hard- +delete rows whose ``deleted_at`` is older than the recovery window, and remove +their objects from storage. More jobs (media processing, scraping, hints) and a +proper queue arrive in later phases. +""" + +import asyncio +import logging +import sys +from datetime import UTC, datetime, timedelta + +from sqlalchemy import delete, select + +from app.core.config import get_settings +from app.core.db import get_sessionmaker +from app.integrations.objectstore.s3 import S3ObjectStore +from app.models import ( + Citation, + Event, + Media, + Name, + Person, + Place, + PlaceName, + Relationship, + Source, + Tree, + User, +) + +logger = logging.getLogger("provenance.worker") + +# Child -> parent so foreign keys are satisfied as rows are removed. +_PURGE_ORDER = [Citation, Name, Event, Relationship, PlaceName, Place, Source, Person, Tree, User] + + +async def _purge_media(sessionmaker, store, cutoff: datetime) -> None: + async with sessionmaker() as session: + rows = ( + await session.execute( + select(Media).where(Media.deleted_at.is_not(None), Media.deleted_at < cutoff) + ) + ).scalars().all() + for media in rows: + try: + await store.delete_object(key=media.storage_key) + except Exception as exc: # noqa: BLE001 + logger.warning("object delete failed for %s: %s", media.storage_key, exc) + await session.delete(media) + await session.commit() + if rows: + logger.info("purged %d media", len(rows)) + + +async def _purge_table(sessionmaker, model, cutoff: datetime) -> None: + async with sessionmaker() as session: + try: + res = await session.execute( + delete(model).where(model.deleted_at.is_not(None), model.deleted_at < cutoff) + ) + await session.commit() + if res.rowcount: + logger.info("purged %d %s", res.rowcount, model.__tablename__) + except Exception as exc: # noqa: BLE001 + await session.rollback() + logger.warning("purge %s failed: %s", model.__tablename__, exc) + + +async def purge_once(sessionmaker, store) -> None: + settings = get_settings() + cutoff = datetime.now(UTC) - timedelta(days=settings.purge_after_days) + await _purge_media(sessionmaker, store, cutoff) + for model in _PURGE_ORDER: + await _purge_table(sessionmaker, model, cutoff) + + +async def main() -> None: + logging.basicConfig( + level=logging.INFO, format="%(levelname)s [%(name)s] %(message)s", stream=sys.stdout + ) + settings = get_settings() + store = S3ObjectStore(settings) + try: + await store.ensure_bucket() + except Exception as exc: # noqa: BLE001 + logger.warning("ensure_bucket failed: %s", exc) + sessionmaker = get_sessionmaker() + logger.info( + "worker started; purge every %ds (recovery window %dd)", + settings.purge_interval_seconds, + settings.purge_after_days, + ) + while True: + try: + await purge_once(sessionmaker, store) + except Exception as exc: # noqa: BLE001 + logger.warning("purge cycle error: %s", exc) + await asyncio.sleep(settings.purge_interval_seconds) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/backend/migrations/versions/7fc7024ef432_media.py b/backend/migrations/versions/7fc7024ef432_media.py new file mode 100644 index 0000000..5656f7f --- /dev/null +++ b/backend/migrations/versions/7fc7024ef432_media.py @@ -0,0 +1,65 @@ +"""media + +Revision ID: 7fc7024ef432 +Revises: 1f6e54f6406a +Create Date: 2026-06-06 21:44:03.204170 + +""" +from collections.abc import Sequence + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '7fc7024ef432' +down_revision: str | None = '1f6e54f6406a' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('media', + sa.Column('uploader_id', sa.Uuid(), nullable=True), + sa.Column('storage_key', sa.String(length=512), nullable=False), + sa.Column('original_filename', sa.String(length=512), nullable=False), + sa.Column('content_type', sa.String(length=128), nullable=False), + sa.Column('byte_size', sa.BigInteger(), nullable=False), + sa.Column('checksum_sha256', sa.String(length=64), nullable=False), + sa.Column('title', sa.String(length=512), nullable=True), + sa.Column('person_id', sa.Uuid(), nullable=True), + sa.Column('event_id', sa.Uuid(), nullable=True), + sa.Column('source_id', sa.Uuid(), nullable=True), + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('tree_id', sa.Uuid(), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('deleted_at', sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint(['event_id'], ['events.id'], name=op.f('fk_media_event_id_events'), ondelete='SET NULL'), + sa.ForeignKeyConstraint(['person_id'], ['persons.id'], name=op.f('fk_media_person_id_persons'), ondelete='SET NULL'), + sa.ForeignKeyConstraint(['source_id'], ['sources.id'], name=op.f('fk_media_source_id_sources'), ondelete='SET NULL'), + sa.ForeignKeyConstraint(['tree_id'], ['trees.id'], name=op.f('fk_media_tree_id_trees'), ondelete='CASCADE'), + sa.ForeignKeyConstraint(['uploader_id'], ['users.id'], name=op.f('fk_media_uploader_id_users'), ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_media')), + sa.UniqueConstraint('storage_key', name=op.f('uq_media_storage_key')) + ) + op.create_index(op.f('ix_media_checksum_sha256'), 'media', ['checksum_sha256'], unique=False) + op.create_index(op.f('ix_media_event_id'), 'media', ['event_id'], unique=False) + op.create_index(op.f('ix_media_person_id'), 'media', ['person_id'], unique=False) + op.create_index(op.f('ix_media_source_id'), 'media', ['source_id'], unique=False) + op.create_index(op.f('ix_media_tree_id'), 'media', ['tree_id'], unique=False) + op.create_index(op.f('ix_media_uploader_id'), 'media', ['uploader_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_media_uploader_id'), table_name='media') + op.drop_index(op.f('ix_media_tree_id'), table_name='media') + op.drop_index(op.f('ix_media_source_id'), table_name='media') + op.drop_index(op.f('ix_media_person_id'), table_name='media') + op.drop_index(op.f('ix_media_event_id'), table_name='media') + op.drop_index(op.f('ix_media_checksum_sha256'), table_name='media') + op.drop_table('media') + # ### end Alembic commands ### diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 7092026..0fabe8d 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -12,6 +12,8 @@ dependencies = [ "asyncpg>=0.30", "alembic>=1.14", "argon2-cffi>=23.1", + "boto3>=1.35", + "python-multipart>=0.0.12", ] [dependency-groups] @@ -36,6 +38,10 @@ extend-exclude = ["migrations/versions"] [tool.ruff.lint] select = ["E", "F", "I", "UP", "B"] +[tool.ruff.lint.flake8-bugbear] +# FastAPI uses these as call-expressions in argument defaults by design. +extend-immutable-calls = ["fastapi.File", "fastapi.Form", "fastapi.Depends", "fastapi.Query", "fastapi.Header"] + [tool.pytest.ini_options] asyncio_mode = "auto" pythonpath = ["."] diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 1682de0..78712d7 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -14,9 +14,10 @@ from httpx import ASGITransport, AsyncClient from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine import app.models # noqa: F401 — register all models on Base.metadata -from app.api.deps import get_mailer +from app.api.deps import get_mailer, get_objectstore from app.core.db import get_session from app.integrations.mailer.base import Mailer +from app.integrations.objectstore.base import ObjectStore from app.main import app from app.models import Base @@ -35,7 +36,25 @@ class CapturingMailer(Mailer): self.resets.append((to, link)) +class FakeObjectStore(ObjectStore): + def __init__(self) -> None: + self.objects: dict[str, tuple[bytes, str]] = {} + + async def ensure_bucket(self) -> None: + pass + + async def put_object(self, *, key: str, data: bytes, content_type: str) -> None: + self.objects[key] = (data, content_type) + + async def presigned_get_url(self, *, key: str) -> str: + return f"https://objects.test/{key}" + + async def delete_object(self, *, key: str) -> None: + self.objects.pop(key, None) + + _mailer = CapturingMailer() +_store = FakeObjectStore() @pytest.fixture @@ -61,8 +80,10 @@ async def client(): _mailer.verifications.clear() _mailer.resets.clear() + _store.objects.clear() app.dependency_overrides[get_session] = _override_session app.dependency_overrides[get_mailer] = lambda: _mailer + app.dependency_overrides[get_objectstore] = lambda: _store transport = ASGITransport(app=app) async with AsyncClient(transport=transport, base_url="http://test") as http_client: diff --git a/backend/tests/test_media.py b/backend/tests/test_media.py new file mode 100644 index 0000000..0b488dd --- /dev/null +++ b/backend/tests/test_media.py @@ -0,0 +1,45 @@ +"""Media upload/list/delete through the API (object store faked in conftest).""" + +from tests.conftest import auth, register + + +async def _tree(client, email): + h = auth(await register(client, email)) + tree_id = (await client.post("/api/v1/trees", json={"name": "M"}, headers=h)).json()["id"] + return h, tree_id + + +async def test_media_upload_list_delete(client): + h, tree_id = await _tree(client, "media1@example.com") + + resp = await client.post( + f"/api/v1/trees/{tree_id}/media", + files={"file": ("scan.txt", b"hello world", "text/plain")}, + data={"title": "A scan"}, + headers=h, + ) + assert resp.status_code == 201, resp.text + body = resp.json() + assert body["original_filename"] == "scan.txt" + assert body["byte_size"] == 11 + assert body["url"].startswith("https://objects.test/") + media_id = body["id"] + + listed = await client.get(f"/api/v1/trees/{tree_id}/media", headers=h) + assert listed.status_code == 200 + assert len(listed.json()) == 1 + + resp = await client.delete(f"/api/v1/trees/{tree_id}/media/{media_id}", headers=h) + assert resp.status_code == 204 + assert len((await client.get(f"/api/v1/trees/{tree_id}/media", headers=h)).json()) == 0 + + +async def test_non_member_cannot_upload(client): + h, tree_id = await _tree(client, "media2@example.com") + other = auth(await register(client, "media-intruder@example.com")) + resp = await client.post( + f"/api/v1/trees/{tree_id}/media", + files={"file": ("x.txt", b"x", "text/plain")}, + headers=other, + ) + assert resp.status_code == 403 diff --git a/backend/uv.lock b/backend/uv.lock index 9c5d5de..5036127 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -125,6 +125,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3c/d7/8fb3044eaef08a310acfe23dae9a8e2e07d305edc29a53497e52bc76eca7/asyncpg-0.31.0-cp314-cp314t-win_amd64.whl", hash = "sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3", size = 706062, upload-time = "2025-11-24T23:26:44.086Z" }, ] +[[package]] +name = "boto3" +version = "1.43.24" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/8f/94dfa39ec618ecb2fe5b5b79428c95100e3ae3c1aa5083c283dd3cfb5ecd/boto3-1.43.24.tar.gz", hash = "sha256:ba5afa266bf7265e0c1a454fcfd48bffe5939cb16ed223bebc669c3dc8ee0bc8", size = 113154, upload-time = "2026-06-05T19:30:01.635Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/59/b7/e66c9b37b96153aa371fe48d24194151293f6577dd3eaa1fc146c281456d/boto3-1.43.24-py3-none-any.whl", hash = "sha256:b18ef745274ef548a9660d733d985d4a971b16bd8a6af88165ea9d0e40913b86", size = 140536, upload-time = "2026-06-05T19:29:58.968Z" }, +] + +[[package]] +name = "botocore" +version = "1.43.24" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/67/55d0611b341482bc9649d16df765f849a1862184ac3709356decf632279f/botocore-1.43.24.tar.gz", hash = "sha256:0c02f2b40e99419d496ece0ea2dcdedb5c45998c16fd1674276c7dbb30767a16", size = 15471690, upload-time = "2026-06-05T19:29:33.731Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/b7/360b5afe74c4d7cff871ea6e8f335e2e11de2945c9deb1eea6438f49faa2/botocore-1.43.24-py3-none-any.whl", hash = "sha256:42903b4bfafd8f15a735ed940473f28e4ba21b2ea67a9b9aaa11dfa7fcb19fd5", size = 15155182, upload-time = "2026-06-05T19:29:29.457Z" }, +] + [[package]] name = "certifi" version = "2026.5.20" @@ -357,6 +385,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + [[package]] name = "mako" version = "1.3.12" @@ -447,9 +484,11 @@ dependencies = [ { name = "alembic" }, { name = "argon2-cffi" }, { name = "asyncpg" }, + { name = "boto3" }, { name = "fastapi" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "python-multipart" }, { name = "sqlalchemy", extra = ["asyncio"] }, { name = "uvicorn", extra = ["standard"] }, ] @@ -467,9 +506,11 @@ requires-dist = [ { name = "alembic", specifier = ">=1.14" }, { name = "argon2-cffi", specifier = ">=23.1" }, { name = "asyncpg", specifier = ">=0.30" }, + { name = "boto3", specifier = ">=1.35" }, { name = "fastapi", specifier = ">=0.115" }, { name = "pydantic", specifier = ">=2.9" }, { name = "pydantic-settings", specifier = ">=2.5" }, + { name = "python-multipart", specifier = ">=0.0.12" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.34" }, ] @@ -613,6 +654,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/03/e2/08a497ef684b88559c9cc5f4ad53a37e7b99e727094a86d6ea32536d5d3c/pytest_asyncio-1.4.0-py3-none-any.whl", hash = "sha256:933ca923a23075a87fb7070c0ec272a6848489824d887c85c812670932835aa1", size = 16930, upload-time = "2026-05-26T09:56:02.576Z" }, ] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.2" @@ -622,6 +675,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.32" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5b/42/55c32bb9b12693c092ad250a0e82edb5b31ddeda6eb772de5f308b3804ad/python_multipart-0.0.32.tar.gz", hash = "sha256:be54b7f3fa167bb83e4fcd936b887b708f4e57fe75911c02aebf53efaf8d938e", size = 46881, upload-time = "2026-06-04T16:18:58.647Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/04/e8135ebd1ad02c56ec633277529b2602ff99ff634be76cdba5744cf554fd/python_multipart-0.0.32-py3-none-any.whl", hash = "sha256:ff6d3f776f16878c894e52e107296ffc890e913c611b1a4ec6c44e2821fe2e23", size = 30042, upload-time = "2026-06-04T16:18:57.319Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" @@ -683,6 +745,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/19/016553f86f207450aebebc2b2b5088d086b901cc8186c02ac4284db3bd88/ruff-0.15.16-py3-none-win_arm64.whl", hash = "sha256:8cd61783afb39638a7133ef0d2dfb1e91277593962f81b5a8423eb0b888a6121", size = 11134555, upload-time = "2026-06-04T16:33:00.136Z" }, ] +[[package]] +name = "s3transfer" +version = "0.18.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e0/1f/12417f7f493fc45e1f9fd5d4a9b6c125cf8d2cf3f8ddbdfab3e76406e9d6/s3transfer-0.18.0.tar.gz", hash = "sha256:3760b8b7ec1315da54048b2d626276732bee4300d054d492d4e1d43e20d4ecbd", size = 160560, upload-time = "2026-05-28T19:39:09.124Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/58/a58fc997655386daa2e25784e30c288aa3e3819e401f77029ee4899fb55a/s3transfer-0.18.0-py3-none-any.whl", hash = "sha256:239c13b09e65ad0346e1be7348b8a202dcad44ac7ea7c6eb858fc881dce739b6", size = 88572, upload-time = "2026-05-28T19:39:07.999Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.50" @@ -755,6 +838,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, ] +[[package]] +name = "urllib3" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, +] + [[package]] name = "uvicorn" version = "0.49.0" diff --git a/deploy/docker-compose.dev.yml b/deploy/docker-compose.dev.yml index c459e4e..16adb50 100644 --- a/deploy/docker-compose.dev.yml +++ b/deploy/docker-compose.dev.yml @@ -12,6 +12,11 @@ services: context: ../backend dockerfile: Dockerfile + worker: + build: + context: ../backend + dockerfile: Dockerfile + frontend: build: context: ../frontend diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index c494d90..3fdbd75 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -47,9 +47,16 @@ services: environment: APP_ENV: ${APP_ENV:-development} DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://provenance:provenance@postgres:5432/provenance} + S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-http://minio:9000} + S3_BUCKET: ${S3_BUCKET:-provenance} + S3_ACCESS_KEY: ${S3_ACCESS_KEY:-provenance} + S3_SECRET_KEY: ${S3_SECRET_KEY:-change-me-too} + S3_REGION: ${S3_REGION:-us-east-1} depends_on: postgres: condition: service_healthy + minio: + condition: service_healthy healthcheck: test: - CMD-SHELL @@ -62,6 +69,28 @@ services: start_period: 20s restart: unless-stopped + # Background worker — same image as the backend, run in worker mode. + # First job: the scheduled soft-delete purge (and media object cleanup). + worker: + image: git.jpaul.io/justin/provenance-backend:${IMAGE_TAG:-test-main} + command: ["uv", "run", "--no-dev", "python", "-m", "app.worker"] + labels: + com.centurylinklabs.watchtower.enable: "true" + environment: + APP_ENV: ${APP_ENV:-development} + DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://provenance:provenance@postgres:5432/provenance} + S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-http://minio:9000} + S3_BUCKET: ${S3_BUCKET:-provenance} + S3_ACCESS_KEY: ${S3_ACCESS_KEY:-provenance} + S3_SECRET_KEY: ${S3_SECRET_KEY:-change-me-too} + S3_REGION: ${S3_REGION:-us-east-1} + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_healthy + restart: unless-stopped + frontend: image: git.jpaul.io/justin/provenance-frontend:${IMAGE_TAG:-test-main} labels: