Dateiverwaltung: email attachment retrieval is working
This commit is contained in:
parent 87daee85f9
commit 6c218642a2
503 changed files with 147127 additions and 0 deletions
15  .env.example  Normal file
@@ -0,0 +1,15 @@
# Dateiverwaltung environment variables
# Copy this file to .env and adjust it

# Database
DATABASE_URL=sqlite:///./data/dateiverwaltung.db

# Time zone
TZ=Europe/Berlin

# OCR settings
OCR_LANGUAGE=deu
OCR_DPI=300

# Optional: Claude API for AI validation (planned extension)
# CLAUDE_API_KEY=sk-ant-...
42  Dockerfile  Normal file
@@ -0,0 +1,42 @@
# Dateiverwaltung Docker image
FROM python:3.11-slim

# System dependencies for OCR and PDF handling
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-deu \
    ocrmypdf \
    poppler-utils \
    ghostscript \
    libmagic1 \
    && rm -rf /var/lib/apt/lists/*

# Working directory
WORKDIR /app

# Python dependencies
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application
COPY backend/ ./backend/
COPY frontend/ ./frontend/
COPY config/ ./config/
COPY regeln/ ./regeln/

# Data directories
RUN mkdir -p /app/data/inbox /app/data/processed /app/data/archive /app/data/zugferd

# Environment variables
ENV PYTHONPATH=/app
ENV DATABASE_URL=sqlite:////app/data/dateiverwaltung.db

# Port
EXPOSE 8000

# Health check (python:3.11-slim ships without curl, so probe with Python's urllib)
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Start
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8000"]
147  README.md
@@ -1,2 +1,149 @@
# Dateiverwaltung

Modular document management system for automatic processing, sorting, and naming of documents.

## Features

- **Mail retrieval**: automatically fetches attachments from IMAP mailboxes
- **PDF processing**: text extraction and OCR for scanned documents
- **ZUGFeRD detection**: automatically detects ZUGFeRD invoices and files them separately
- **Rule engine**: flexible, extensible rules for recognition and naming
- **Pipeline system**: several independent pipelines (company, private, etc.)

## Quick start

### With Docker (recommended)

```bash
# Build the image and start
docker-compose up -d

# Follow the logs
docker-compose logs -f

# Stop
docker-compose down
```

Then open http://localhost:8000 in the browser.

### Without Docker

```bash
# Create a virtual environment
cd backend
python -m venv venv
source venv/bin/activate  # Linux/Mac
# or: venv\Scripts\activate  # Windows

# Install dependencies
pip install -r requirements.txt

# Start
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```

## Naming scheme

### Recurring documents (invoices)
```
{Jahr}.{Monat}.{Tag} - {Kategorie} - {Ersteller} - {Dokumentennummer} - {Sammelbegriff} - {Preis} EUR.pdf

Example:
2026.02.01 - Rechnung - Sonepar - 10023934 - Material - 1600 EUR.pdf
```

### One-off documents (contracts, certificates)
```
{Typ} - {Aussteller} - {Beschreibung} - {Jahr}.pdf

Example:
Zeugnis - Schule X - Grundschulzeugnis - 2026.pdf
```

## Project structure

```
dateiverwaltung/
├── backend/
│   ├── app/
│   │   ├── models/    # database models
│   │   ├── modules/   # core modules (mail, PDF, sorter)
│   │   ├── routes/    # API endpoints
│   │   ├── services/  # business logic
│   │   └── main.py    # FastAPI app
│   └── requirements.txt
├── frontend/
│   ├── static/
│   │   ├── css/
│   │   └── js/
│   └── templates/
├── data/              # persistent data
│   ├── inbox/         # new files
│   ├── processed/     # processed files
│   ├── archive/       # sorted files
│   └── zugferd/       # ZUGFeRD invoices
├── regeln/            # rule examples
├── docker-compose.yml
├── Dockerfile
└── README.md
```

## Modules

### Mail fetcher
Fetches attachments from IMAP mailboxes with configurable filters:
- file types (.pdf, .jpg, etc.)
- maximum size
- IMAP folder

### PDF processor
- **Text extraction**: via pdfplumber/pypdf
- **OCR**: via ocrmypdf + Tesseract (German)
- **ZUGFeRD**: detection via the factur-x library

### Sorter
Rule-based recognition and naming:
- pattern matching (text, sender, filename)
- regex-based field extraction
- configurable naming scheme

## API endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET  | /api/pipelines | all pipelines |
| POST | /api/pipelines | create a pipeline |
| POST | /api/pipelines/{id}/run | run a pipeline |
| GET  | /api/pipelines/{id}/mail-configs | mail configurations |
| POST | /api/pipelines/{id}/mail-configs | add a mailbox |
| GET  | /api/pipelines/{id}/regeln | sorting rules |
| POST | /api/pipelines/{id}/regeln | add a rule |
| POST | /api/regeln/test | test a rule |
| GET  | /api/dokumente | processed documents |
| GET  | /api/stats | statistics |
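For illustration only, a small Python sketch against these endpoints (requires the requests package and assumes the server from the quick start is running on localhost:8000; the response shape and the empty POST body are assumptions, since the route schemas live in backend/app/routes/api.py, which this commit adds but which is not shown here):

```python
import requests

BASE = "http://localhost:8000"

# GET endpoints from the table above
pipelines = requests.get(f"{BASE}/api/pipelines", timeout=10).json()
stats = requests.get(f"{BASE}/api/stats", timeout=10).json()
print(stats)

# Trigger a run for the first pipeline, if any exist
if pipelines:
    pipeline_id = pipelines[0]["id"]  # assumes a list of objects with an "id" field
    r = requests.post(f"{BASE}/api/pipelines/{pipeline_id}/run", timeout=60)
    r.raise_for_status()
    print(r.json())
```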
## Regex examples for rules

```yaml
# Date (DD.MM.YYYY)
(\d{2}[./]\d{2}[./]\d{4})

# Invoice number
(?:Rechnungsnummer|Invoice)[:\s]*(\d+)

# Amount with EUR
(?:Gesamtbetrag|Summe)[:\s]*([\d.,]+)\s*(?:EUR|€)
```
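A quick sketch of how such a pattern behaves, using Python's re module directly (the sample text is invented for illustration):

```python
import re

text = "Rechnungsnummer: 10023934\nGesamtbetrag: 1.600,00 EUR"

nummer = re.search(r"(?:Rechnungsnummer|Invoice)[:\s]*(\d+)", text)
betrag = re.search(r"(?:Gesamtbetrag|Summe)[:\s]*([\d.,]+)\s*(?:EUR|€)", text)

print(nummer.group(1))  # 10023934
print(betrag.group(1))  # 1.600,00
```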
## Planned extensions

- [ ] Claude API integration for AI validation
- [ ] Scheduler for automatic runs
- [ ] Dolibarr integration
- [ ] Dashboard with charts
1  backend/app/__init__.py  Normal file
@@ -0,0 +1 @@
# Dateiverwaltung - modular document management system

BIN  backend/app/__pycache__/__init__.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/__pycache__/config.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/__pycache__/main.cpython-313.pyc  Normal file (binary file not shown)
26  backend/app/config.py  Normal file
@@ -0,0 +1,26 @@
"""Central configuration"""
import os
from pathlib import Path

# Base paths
BASE_DIR = Path(__file__).parent.parent.parent
DATA_DIR = BASE_DIR / "data"
CONFIG_DIR = BASE_DIR / "config"
REGELN_DIR = BASE_DIR / "regeln"

# Database
DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite:///{DATA_DIR}/dateiverwaltung.db")

# Folder structure
INBOX_DIR = DATA_DIR / "inbox"
PROCESSED_DIR = DATA_DIR / "processed"
ARCHIVE_DIR = DATA_DIR / "archive"
ZUGFERD_DIR = DATA_DIR / "zugferd"

# OCR settings
OCR_LANGUAGE = "deu"  # German
OCR_DPI = 300

# Create the folders if they do not exist yet
for dir_path in [INBOX_DIR, PROCESSED_DIR, ARCHIVE_DIR, ZUGFERD_DIR, REGELN_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)
62  backend/app/main.py  Normal file
@@ -0,0 +1,62 @@
"""
Dateiverwaltung - modular document management system
Main application built with FastAPI
"""
from fastapi import FastAPI, Request
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi.responses import HTMLResponse
from pathlib import Path
import logging

from .models import init_db
from .routes.api import router as api_router
from .config import BASE_DIR

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Create the app
app = FastAPI(
    title="Dateiverwaltung",
    description="Modulares Dokumenten-Management-System",
    version="1.0.0"
)

# Static files
frontend_dir = BASE_DIR / "frontend"
app.mount("/static", StaticFiles(directory=frontend_dir / "static"), name="static")

# Templates
templates = Jinja2Templates(directory=frontend_dir / "templates")

# API router
app.include_router(api_router)


@app.on_event("startup")
async def startup():
    """Initialisation on startup"""
    init_db()
    logging.info("Database initialised")


@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Main page"""
    return templates.TemplateResponse("index.html", {"request": request})


@app.get("/health")
async def health():
    """Health check for Docker"""
    return {"status": "ok"}


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
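A minimal sketch of exercising the /health endpoint with FastAPI's test client (requires the httpx package; shown against a stripped-down stand-in app so it runs standalone, since the real app also mounts static files and templates that need the frontend/ directory on disk):

```python
from fastapi import FastAPI
from fastapi.testclient import TestClient

# Stand-in for backend.app.main:app with only the health route
app = FastAPI()

@app.get("/health")
async def health():
    return {"status": "ok"}

client = TestClient(app)
response = client.get("/health")
assert response.status_code == 200
assert response.json() == {"status": "ok"}
```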
4  backend/app/models/__init__.py  Normal file
@@ -0,0 +1,4 @@
from .database import (
    Postfach, QuellOrdner, SortierRegel, VerarbeiteteDatei,
    init_db, get_db, SessionLocal
)

BIN  backend/app/models/__pycache__/__init__.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/models/__pycache__/database.cpython-313.pyc  Normal file (binary file not shown)
161  backend/app/models/database.py  Normal file
@@ -0,0 +1,161 @@
"""Database models - two separate areas: mail retrieval and file sorting"""
from sqlalchemy import create_engine, Column, Integer, String, Boolean, DateTime, Text, JSON
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime

from ..config import DATABASE_URL

engine = create_engine(DATABASE_URL, echo=False)
SessionLocal = sessionmaker(bind=engine)
Base = declarative_base()


# ============ AREA 1: mail retrieval ============

class Postfach(Base):
    """IMAP mailbox configuration"""
    __tablename__ = "postfaecher"

    id = Column(Integer, primary_key=True)
    name = Column(String(100), nullable=False)

    # IMAP
    imap_server = Column(String(255), nullable=False)
    imap_port = Column(Integer, default=993)
    email = Column(String(255), nullable=False)
    passwort = Column(String(255), nullable=False)
    ordner = Column(String(100), default="INBOX")
    alle_ordner = Column(Boolean, default=False)  # search all IMAP folders
    nur_ungelesen = Column(Boolean, default=False)  # only unread mails (False = all)

    # Target
    ziel_ordner = Column(String(500), nullable=False)

    # Filters
    erlaubte_typen = Column(JSON, default=lambda: [".pdf"])
    max_groesse_mb = Column(Integer, default=25)

    # Status
    aktiv = Column(Boolean, default=True)
    letzter_abruf = Column(DateTime)
    letzte_anzahl = Column(Integer, default=0)


# ============ AREA 2: file sorting ============

class QuellOrdner(Base):
    """Folder that is scanned for files"""
    __tablename__ = "quell_ordner"

    id = Column(Integer, primary_key=True)
    name = Column(String(100), nullable=False)
    pfad = Column(String(500), nullable=False)
    ziel_ordner = Column(String(500), nullable=False)
    rekursiv = Column(Boolean, default=True)  # include subfolders
    dateitypen = Column(JSON, default=lambda: [".pdf", ".jpg", ".jpeg", ".png", ".tiff"])
    aktiv = Column(Boolean, default=True)


class SortierRegel(Base):
    """Rules for file recognition and naming"""
    __tablename__ = "sortier_regeln"

    id = Column(Integer, primary_key=True)
    name = Column(String(100), nullable=False)
    prioritaet = Column(Integer, default=100)
    aktiv = Column(Boolean, default=True)

    # Recognition patterns
    muster = Column(JSON, default=dict)

    # Extraction
    extraktion = Column(JSON, default=dict)

    # Output
    schema = Column(String(500), default="{datum} - Dokument.pdf")
    unterordner = Column(String(100))  # optional: subfolder inside the target


class VerarbeiteteMail(Base):
    """Tracks which mails have already been processed"""
    __tablename__ = "verarbeitete_mails"

    id = Column(Integer, primary_key=True)
    postfach_id = Column(Integer, nullable=False)
    message_id = Column(String(500), nullable=False)  # email Message-ID header
    ordner = Column(String(200))  # IMAP folder
    betreff = Column(String(500))
    absender = Column(String(255))
    anzahl_attachments = Column(Integer, default=0)
    verarbeitet_am = Column(DateTime, default=datetime.utcnow)


class VerarbeiteteDatei(Base):
    """Log of processed files"""
    __tablename__ = "verarbeitete_dateien"

    id = Column(Integer, primary_key=True)
    original_pfad = Column(String(1000))
    original_name = Column(String(500))
    neuer_pfad = Column(String(1000))
    neuer_name = Column(String(500))

    ist_zugferd = Column(Boolean, default=False)
    ocr_durchgefuehrt = Column(Boolean, default=False)

    status = Column(String(50))  # sortiert, zugferd, fehler, keine_regel
    fehler = Column(Text)

    extrahierte_daten = Column(JSON)
    verarbeitet_am = Column(DateTime, default=datetime.utcnow)


def migrate_db():
    """Adds missing columns without deleting data"""
    from sqlalchemy import inspect, text

    inspector = inspect(engine)

    # Migration definitions: {table: {column: sql_type}}
    migrations = {
        "postfaecher": {
            "alle_ordner": "BOOLEAN DEFAULT 0",
            "nur_ungelesen": "BOOLEAN DEFAULT 0"
        },
        "quell_ordner": {
            "rekursiv": "BOOLEAN DEFAULT 1",
            "dateitypen": "JSON"
        }
    }

    with engine.connect() as conn:
        for table, columns in migrations.items():
            if table not in inspector.get_table_names():
                continue

            existing = [col["name"] for col in inspector.get_columns(table)]

            for col_name, col_type in columns.items():
                if col_name not in existing:
                    try:
                        conn.execute(text(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}"))
                        conn.commit()
                        print(f"Migration: added {table}.{col_name}")
                    except Exception as e:
                        print(f"Migration skipped: {table}.{col_name} - {e}")


def init_db():
    """Initialise the database"""
    Base.metadata.create_all(engine)
    migrate_db()


def get_db():
    """Database session generator"""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
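A self-contained sketch of the same additive-migration pattern migrate_db() uses, runnable against an in-memory SQLite database (the table and column here mirror the real ones, but the "old" schema is invented for the demo):

```python
from sqlalchemy import create_engine, inspect, text

engine = create_engine("sqlite:///:memory:")

with engine.connect() as conn:
    # An "old" schema that predates a later model change
    conn.execute(text("CREATE TABLE postfaecher (id INTEGER PRIMARY KEY, name TEXT)"))
    conn.commit()

# Additive migration: compare existing columns with the desired ones and
# ALTER TABLE only for the missing ones, leaving existing rows intact.
wanted = {"postfaecher": {"alle_ordner": "BOOLEAN DEFAULT 0"}}
inspector = inspect(engine)

with engine.connect() as conn:
    for table, columns in wanted.items():
        existing = [col["name"] for col in inspector.get_columns(table)]
        for col_name, col_type in columns.items():
            if col_name not in existing:
                conn.execute(text(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}"))
                conn.commit()

print([col["name"] for col in inspect(engine).get_columns("postfaecher")])
# ['id', 'name', 'alle_ordner']
```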
1  backend/app/modules/__init__.py  Normal file
@@ -0,0 +1 @@
# Modules for pipeline processing

BIN  backend/app/modules/__pycache__/__init__.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/modules/__pycache__/extraktoren.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/modules/__pycache__/mail_fetcher.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/modules/__pycache__/pdf_processor.cpython-313.pyc  Normal file (binary file not shown)
BIN  backend/app/modules/__pycache__/sorter.cpython-313.pyc  Normal file (binary file not shown)
373  backend/app/modules/extraktoren.py  Normal file
@@ -0,0 +1,373 @@
"""
Global field extractors with cascading regexes
Used automatically as a fallback when rule-specific patterns do not match
"""
import re
from datetime import datetime
from typing import Optional, List, Dict, Any
import logging

logger = logging.getLogger(__name__)


# ============ DATE ============
DATUM_MUSTER = [
    # With context (more reliable)
    {"regex": r"Rechnungsdatum[:\s]*(\d{2})[./](\d{2})[./](\d{4})", "order": "dmy"},
    {"regex": r"Belegdatum[:\s]*(\d{2})[./](\d{2})[./](\d{4})", "order": "dmy"},
    {"regex": r"Datum[:\s]*(\d{2})[./](\d{2})[./](\d{4})", "order": "dmy"},
    {"regex": r"Date[:\s]*(\d{2})[./](\d{2})[./](\d{4})", "order": "dmy"},
    {"regex": r"vom[:\s]*(\d{2})[./](\d{2})[./](\d{4})", "order": "dmy"},

    # ISO format
    {"regex": r"(\d{4})-(\d{2})-(\d{2})", "order": "ymd"},

    # German format without context
    {"regex": r"(\d{2})\.(\d{2})\.(\d{4})", "order": "dmy"},
    {"regex": r"(\d{2})/(\d{2})/(\d{4})", "order": "dmy"},

    # US format (identical pattern to the entry above, so the dmy
    # interpretation always wins in practice)
    {"regex": r"(\d{2})/(\d{2})/(\d{4})", "order": "mdy"},

    # Spelled-out month names
    {"regex": r"(\d{1,2})\.\s*(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s*(\d{4})", "order": "dMy"},
    {"regex": r"(\d{1,2})\s+(Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)[a-z]*\.?\s+(\d{4})", "order": "dMy"},
]

MONATE_DE = {
    "januar": 1, "februar": 2, "märz": 3, "april": 4, "mai": 5, "juni": 6,
    "juli": 7, "august": 8, "september": 9, "oktober": 10, "november": 11, "dezember": 12,
    "jan": 1, "feb": 2, "mär": 3, "apr": 4, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "okt": 10, "nov": 11, "dez": 12
}


def extrahiere_datum(text: str, spezifische_muster: List[Dict] = None) -> Optional[str]:
    """
    Extracts a date from text using the cascade approach
    Returns: ISO format YYYY-MM-DD, or None
    """
    muster_liste = (spezifische_muster or []) + DATUM_MUSTER

    for muster in muster_liste:
        try:
            match = re.search(muster["regex"], text, re.IGNORECASE)
            if match:
                groups = match.groups()
                order = muster.get("order", "dmy")

                if order == "dmy":
                    tag, monat, jahr = int(groups[0]), int(groups[1]), int(groups[2])
                elif order == "ymd":
                    jahr, monat, tag = int(groups[0]), int(groups[1]), int(groups[2])
                elif order == "mdy":
                    monat, tag, jahr = int(groups[0]), int(groups[1]), int(groups[2])
                elif order == "dMy":
                    tag = int(groups[0])
                    monat = MONATE_DE.get(groups[1].lower(), 1)
                    jahr = int(groups[2])
                else:
                    continue

                # Validation
                if 1 <= tag <= 31 and 1 <= monat <= 12 and 1900 <= jahr <= 2100:
                    return f"{jahr:04d}-{monat:02d}-{tag:02d}"
        except Exception as e:
            logger.debug(f"Date extraction failed: {e}")
            continue

    return None


# ============ AMOUNT ============
BETRAG_MUSTER = [
    # With context (more reliable)
    {"regex": r"Gesamtbetrag[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Rechnungsbetrag[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Endbetrag[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Summe[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Total[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Brutto[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"zu zahlen[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},
    {"regex": r"Zahlbetrag[:\s]*([\d.,]+)\s*(?:EUR|€)?", "context": True},

    # With currency only (less reliable)
    {"regex": r"([\d.,]+)\s*(?:EUR|€)", "context": False},
    {"regex": r"€\s*([\d.,]+)", "context": False},
]


def extrahiere_betrag(text: str, spezifische_muster: List[Dict] = None) -> Optional[str]:
    """
    Extracts an amount from text using the cascade approach
    Returns: formatted amount (e.g. "1234,56") or None
    """
    muster_liste = (spezifische_muster or []) + BETRAG_MUSTER

    for muster in muster_liste:
        try:
            match = re.search(muster["regex"], text, re.IGNORECASE)
            if match:
                betrag_str = match.group(1)
                betrag = _parse_betrag(betrag_str)
                if betrag is not None and betrag > 0:
                    # Formatting: integer if possible, otherwise two decimals
                    if betrag == int(betrag):
                        return str(int(betrag))
                    return f"{betrag:.2f}".replace(".", ",")
        except Exception as e:
            logger.debug(f"Amount extraction failed: {e}")
            continue

    return None


def _parse_betrag(betrag_str: str) -> Optional[float]:
    """Parses an amount string to float"""
    betrag_str = betrag_str.strip()

    # Remove spaces
    betrag_str = betrag_str.replace(" ", "")

    # German format: 1.234,56 -> 1234.56
    if "," in betrag_str and "." in betrag_str:
        if betrag_str.rfind(",") > betrag_str.rfind("."):
            # German format
            betrag_str = betrag_str.replace(".", "").replace(",", ".")
        else:
            # English format
            betrag_str = betrag_str.replace(",", "")
    elif "," in betrag_str:
        # Comma only: German decimal separator
        betrag_str = betrag_str.replace(",", ".")

    try:
        return float(betrag_str)
    except ValueError:
        return None


# ============ INVOICE NUMBER ============
NUMMER_MUSTER = [
    # With context
    {"regex": r"Rechnungsnummer[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Rechnung\s*Nr\.?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Rechnungs-Nr\.?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Invoice\s*(?:No\.?|Number)?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Beleg-?Nr\.?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Dokumentnummer[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Bestell-?Nr\.?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},
    {"regex": r"Auftrags-?Nr\.?[:\s#]*([A-Z0-9][\w\-/]+)", "context": True},

    # Typical formats without context
    {"regex": r"RE-?(\d{4,})", "context": False},
    {"regex": r"INV-?(\d{4,})", "context": False},
]


def extrahiere_nummer(text: str, spezifische_muster: List[Dict] = None) -> Optional[str]:
    """
    Extracts an invoice/document number from text
    """
    muster_liste = (spezifische_muster or []) + NUMMER_MUSTER

    for muster in muster_liste:
        try:
            match = re.search(muster["regex"], text, re.IGNORECASE)
            if match:
                nummer = match.group(1).strip()
                if len(nummer) >= 3:  # at least 3 characters
                    return nummer
        except Exception as e:
            logger.debug(f"Number extraction failed: {e}")
            continue

    return None


# ============ COMPANY/SENDER ============
FIRMA_MUSTER = [
    # Sender line
    {"regex": r"^([A-ZÄÖÜ][A-Za-zäöüÄÖÜß\s&\-\.]+(?:GmbH|AG|KG|e\.K\.|Inc|Ltd|SE|UG))", "context": True},
    {"regex": r"Absender[:\s]*([A-Za-zäöüÄÖÜß\s&\-\.]+)", "context": True},
    {"regex": r"Von[:\s]*([A-Za-zäöüÄÖÜß\s&\-\.]+)", "context": True},
]

# Known companies (searched for in the text)
BEKANNTE_FIRMEN = [
    "Sonepar", "Amazon", "Ebay", "MediaMarkt", "Saturn", "Conrad", "Reichelt",
    "Hornbach", "Bauhaus", "OBI", "Hagebau", "Toom", "Hellweg",
    "Telekom", "Vodafone", "O2", "1&1",
    "Allianz", "HUK", "Provinzial", "DEVK", "Gothaer",
    "IKEA", "Poco", "XXXLutz", "Roller",
    "Alternate", "Mindfactory", "Caseking", "Notebooksbilliger",
    "DHL", "DPD", "Hermes", "UPS", "GLS",
]


def extrahiere_firma(text: str, absender_email: str = "", spezifische_muster: List[Dict] = None) -> Optional[str]:
    """
    Extracts a company name from the text or the email sender
    """
    text_lower = text.lower()

    # 1. Look for known companies in the text
    for firma in BEKANNTE_FIRMEN:
        if firma.lower() in text_lower:
            return firma

    # 2. Derive from the email domain
    if absender_email:
        match = re.search(r"@([\w\-]+)\.", absender_email)
        if match:
            domain = match.group(1)
            # Capitalise known domain names
            for firma in BEKANNTE_FIRMEN:
                if firma.lower() == domain.lower():
                    return firma
            return domain.capitalize()

    # 3. Regex patterns
    muster_liste = (spezifische_muster or []) + FIRMA_MUSTER
    for muster in muster_liste:
        try:
            match = re.search(muster["regex"], text, re.MULTILINE)
            if match:
                firma = match.group(1).strip()
                if len(firma) >= 2:
                    return firma
        except Exception:
            continue

    return None


# ============ DOCUMENT TYPE ============
DOKUMENTTYP_KEYWORDS = {
    "Rechnung": ["rechnung", "invoice", "faktura", "bill"],
    "Angebot": ["angebot", "quotation", "quote", "offerte"],
    "Gutschrift": ["gutschrift", "credit note", "erstattung"],
    "Mahnung": ["mahnung", "zahlungserinnerung", "payment reminder"],
    "Lieferschein": ["lieferschein", "delivery note", "packing slip"],
    "Auftragsbestätigung": ["auftragsbestätigung", "order confirmation", "bestellbestätigung"],
    "Vertrag": ["vertrag", "contract", "vereinbarung"],
    "Versicherungsschein": ["versicherungsschein", "police", "versicherungspolice"],
    "Zeugnis": ["zeugnis", "certificate", "zertifikat"],
    "Bescheinigung": ["bescheinigung", "nachweis", "bestätigung"],
    "Kontoauszug": ["kontoauszug", "account statement", "bankbeleg"],
    "Beitragsrechnung": ["beitragsrechnung", "beitragsberechnung", "mitgliedsbeitrag"],
}


def extrahiere_dokumenttyp(text: str, dateiname: str = "") -> Optional[str]:
    """
    Detects the document type from keywords
    """
    text_lower = text.lower() + " " + dateiname.lower()

    for typ, keywords in DOKUMENTTYP_KEYWORDS.items():
        for keyword in keywords:
            if keyword in text_lower:
                return typ

    return None


# ============ MAIN FUNCTION ============
def extrahiere_alle_felder(text: str, dokument_info: Dict = None,
                           regel_extraktion: Dict = None) -> Dict[str, Any]:
    """
    Extracts all available fields from a document

    Args:
        text: the text extracted from the PDF
        dokument_info: extra info (absender, original_name, etc.)
        regel_extraktion: rule-specific extraction settings

    Returns:
        Dict with all extracted fields
    """
    dokument_info = dokument_info or {}
    regel_extraktion = regel_extraktion or {}

    felder = {}

    # Date
    datum_muster = regel_extraktion.get("datum", {}).get("muster", [])
    datum = extrahiere_datum(text, datum_muster if isinstance(datum_muster, list) else None)
    if datum:
        felder["datum"] = datum

    # Amount
    betrag_muster = regel_extraktion.get("betrag", {}).get("muster", [])
    betrag = extrahiere_betrag(text, betrag_muster if isinstance(betrag_muster, list) else None)
    if betrag:
        felder["betrag"] = betrag

    # Number
    nummer_muster = regel_extraktion.get("nummer", {}).get("muster", [])
    nummer = extrahiere_nummer(text, nummer_muster if isinstance(nummer_muster, list) else None)
    if nummer:
        felder["nummer"] = nummer

    # Company
    absender = dokument_info.get("absender", "")
    firma = extrahiere_firma(text, absender)
    if firma:
        felder["firma"] = firma

    # Document type
    dateiname = dokument_info.get("original_name", "")
    typ = extrahiere_dokumenttyp(text, dateiname)
    if typ:
        felder["typ"] = typ

    # Take over static values from the rule
    for feld_name, feld_config in regel_extraktion.items():
        if isinstance(feld_config, dict) and "wert" in feld_config:
            felder[feld_name] = feld_config["wert"]

    return felder


# ============ SCHEMA BUILDER ============
def baue_dateiname(schema: str, felder: Dict[str, Any], endung: str = ".pdf") -> str:
    """
    Builds a filename from a schema and the extracted fields.
    Automatically removes placeholders and their separators when a field is missing.

    Schema example: "{datum} - {typ} - {firma} - {nummer} - {betrag} EUR"
    With felder = {datum: "2026-10-01", typ: "Rechnung", firma: "Sonepar"}
    Result: "2026-10-01 - Rechnung - Sonepar.pdf"
    """
    # Process the schema without its extension
    if schema.lower().endswith(".pdf"):
        schema = schema[:-4]

    # Replace placeholders
    result = schema
    for key, value in felder.items():
        placeholder = "{" + key + "}"
        if placeholder in result and value:
            result = result.replace(placeholder, str(value))

    # Remove unreplaced placeholders and their separators
    # Patterns: " - {feld}", "{feld} - ", or bare "{feld}"
    result = re.sub(r'\s*-\s*\{[^}]+\}', '', result)
    result = re.sub(r'\{[^}]+\}\s*-\s*', '', result)
    result = re.sub(r'\{[^}]+\}', '', result)

    # Clean up: duplicate separators, whitespace
    result = re.sub(r'\s*-\s*-\s*', ' - ', result)
    result = re.sub(r'\s+', ' ', result)
    result = result.strip(' -')

    # Remove invalid characters
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        result = result.replace(char, "_")

    # Append the extension
    if not result:
        result = "Dokument"

    return result + endung
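To see the cascade end to end, a minimal sketch (the sample invoice text is invented; the import assumes the package is importable as in the Dockerfile's PYTHONPATH=/app; the expected output follows from the patterns above):

```python
from backend.app.modules.extraktoren import extrahiere_alle_felder, baue_dateiname

text = """
Sonepar Deutschland
Rechnungsnummer: 10023934
Rechnungsdatum: 01.02.2026
Gesamtbetrag: 1.600,00 EUR
"""

felder = extrahiere_alle_felder(text, {"original_name": "scan_0815.pdf"})
# {'datum': '2026-02-01', 'betrag': '1600', 'nummer': '10023934',
#  'firma': 'Sonepar', 'typ': 'Rechnung'}

name = baue_dateiname("{datum} - {typ} - {firma} - {nummer} - {betrag} EUR", felder)
print(name)  # 2026-02-01 - Rechnung - Sonepar - 10023934 - 1600 EUR.pdf
```

Note that the global date extractor returns ISO dates (2026-02-01), while the README's naming examples use dotted dates (2026.02.01); any conversion between the two would happen in the rule's schema handling.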
392  backend/app/modules/mail_fetcher.py  Normal file
@@ -0,0 +1,392 @@
"""
Mail fetcher module
Fetches attachments from IMAP mailboxes
"""
import imaplib
import email
from email.header import decode_header
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
import logging

from ..config import INBOX_DIR

logger = logging.getLogger(__name__)


class MailFetcher:
    """Fetches attachments from an IMAP mailbox"""

    def __init__(self, config: Dict):
        """
        Args:
            config: dict with imap_server, imap_port, email, passwort, ordner,
                    erlaubte_typen, max_groesse_mb
        """
        self.config = config
        self.connection = None

    def connect(self) -> bool:
        """Connect to the IMAP server"""
        try:
            self.connection = imaplib.IMAP4_SSL(
                self.config["imap_server"],
                self.config.get("imap_port", 993)
            )
            self.connection.login(
                self.config["email"],
                self.config["passwort"]
            )
            return True
        except Exception as e:
            logger.error(f"IMAP connection error: {e}")
            return False

    def disconnect(self):
        """Close the connection"""
        if self.connection:
            try:
                self.connection.logout()
            except Exception:
                pass
            self.connection = None

    def liste_ordner(self) -> List[str]:
        """Lists all available IMAP folders"""
        if not self.connection:
            if not self.connect():
                return []

        try:
            status, folders = self.connection.list()
            ordner_liste = []
            if status == "OK":
                for folder in folders:
                    if isinstance(folder, bytes):
                        # Format: (flags) "delimiter" "name"
                        parts = folder.decode().split(' "')
                        if len(parts) >= 3:
                            name = parts[-1].strip('"')
                            ordner_liste.append(name)
                        else:
                            # Fallback
                            ordner_liste.append(folder.decode().split()[-1].strip('"'))
            return ordner_liste
        except Exception as e:
            logger.error(f"Error listing folders: {e}")
            return []

    def fetch_attachments(self, ziel_ordner: Optional[Path] = None,
                          nur_ungelesen: bool = False,
                          markiere_gelesen: bool = False,
                          alle_ordner: bool = False,
                          bereits_verarbeitet: set = None) -> List[Dict]:
        """
        Fetches all attachments that pass the filters

        Args:
            alle_ordner: if True, ALL IMAP folders are searched
            bereits_verarbeitet: set of Message-IDs to skip

        Returns:
            List of dicts with: pfad, original_name, absender, betreff, datum, groesse, message_id
        """
        if not self.connection:
            if not self.connect():
                return []

        ziel = ziel_ordner or INBOX_DIR
        ziel.mkdir(parents=True, exist_ok=True)

        ergebnisse = []
        erlaubte_typen = self.config.get("erlaubte_typen", [".pdf"])
        max_groesse = self.config.get("max_groesse_mb", 25) * 1024 * 1024
        bereits_verarbeitet = bereits_verarbeitet or set()

        # Determine the folders to search
        if alle_ordner:
            ordner_liste = self.liste_ordner()
            logger.info(f"Searching {len(ordner_liste)} folders")
        else:
            ordner_liste = [self.config.get("ordner", "INBOX")]

        for ordner in ordner_liste:
            ergebnisse.extend(self._fetch_from_folder(
                ordner, ziel, erlaubte_typen, max_groesse,
                nur_ungelesen, markiere_gelesen, bereits_verarbeitet
            ))

        return ergebnisse

    def _fetch_from_folder(self, ordner: str, ziel: Path,
                           erlaubte_typen: List[str], max_groesse: int,
                           nur_ungelesen: bool, markiere_gelesen: bool,
                           bereits_verarbeitet: set) -> List[Dict]:
        """Fetches attachments from a single folder"""
        ergebnisse = []

        try:
            # Select the folder
            status, _ = self.connection.select(ordner)

            # Search for mails
            search_criteria = "(UNSEEN)" if nur_ungelesen else "ALL"
            status, messages = self.connection.search(None, search_criteria)

            if status != "OK":
                logger.warning(f"No mails found in {ordner}")
                return []

            mail_ids = messages[0].split()
            logger.info(f"Found {len(mail_ids)} mails in {ordner}")

            for mail_id in mail_ids:
                try:
                    # Fetch the mail
                    status, msg_data = self.connection.fetch(mail_id, "(RFC822)")
                    if status != "OK":
                        continue

                    msg = email.message_from_bytes(msg_data[0][1])

                    # Extract the Message-ID and check whether it was already processed
                    message_id = msg.get("Message-ID", "")
                    if message_id and message_id in bereits_verarbeitet:
                        continue  # already processed, skip

                    # Extract metadata
                    absender = self._decode_header(msg.get("From", ""))
                    betreff = self._decode_header(msg.get("Subject", ""))
                    datum = msg.get("Date", "")

                    # Walk through the attachments
                    for part in msg.walk():
                        if part.get_content_maintype() == "multipart":
                            continue

                        filename = part.get_filename()
                        if not filename:
                            continue

                        filename = self._decode_header(filename)
                        datei_endung = Path(filename).suffix.lower()

                        # Check the filters
                        if datei_endung not in erlaubte_typen:
                            logger.debug(f"Skipping {filename}: type {datei_endung} not allowed")
                            continue

                        payload = part.get_payload(decode=True)
                        if not payload:
                            continue

                        if len(payload) > max_groesse:
                            logger.warning(f"Skipping {filename}: too large ({len(payload)} bytes)")
                            continue

                        # Save
                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                        safe_filename = self._safe_filename(filename)
                        ziel_pfad = ziel / f"{timestamp}_{safe_filename}"

                        # Ensure a unique name
                        counter = 1
                        while ziel_pfad.exists():
                            ziel_pfad = ziel / f"{timestamp}_{counter}_{safe_filename}"
                            counter += 1

                        ziel_pfad.write_bytes(payload)

                        ergebnisse.append({
                            "pfad": str(ziel_pfad),
                            "original_name": filename,
                            "absender": absender,
                            "betreff": betreff,
                            "datum": datum,
                            "groesse": len(payload),
                            "message_id": message_id,
                            "ordner": ordner
                        })

                        logger.info(f"Saved: {ziel_pfad.name}")

                    # Mark as read (note: this checks the overall result list,
                    # not just this mail's attachments)
                    if markiere_gelesen and ergebnisse:
                        self.connection.store(mail_id, "+FLAGS", "\\Seen")

                except Exception as e:
                    logger.error(f"Error in mail {mail_id}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Error while fetching: {e}")

        return ergebnisse

    def _decode_header(self, value: str) -> str:
        """Decodes email headers (they may be MIME-encoded)"""
        if not value:
            return ""
        try:
            decoded_parts = decode_header(value)
            result = []
            for part, charset in decoded_parts:
                if isinstance(part, bytes):
                    result.append(part.decode(charset or "utf-8", errors="replace"))
                else:
                    result.append(part)
            return " ".join(result)
        except Exception:
            return str(value)

    def _safe_filename(self, filename: str) -> str:
        """Makes a filename safe for the filesystem"""
        # Replace invalid characters
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            filename = filename.replace(char, "_")
        return filename.strip()

    def fetch_attachments_generator(self, ziel_ordner: Optional[Path] = None,
                                    nur_ungelesen: bool = False,
                                    markiere_gelesen: bool = False,
                                    alle_ordner: bool = False,
                                    bereits_verarbeitet: set = None):
        """
        Generator version for streaming - yields events while fetching

        Yields:
            Dict with type: "ordner", "mails", "datei", "skip", "fehler"
        """
        if not self.connection:
            if not self.connect():
                yield {"type": "fehler", "nachricht": "Verbindung fehlgeschlagen"}
                return

        ziel = ziel_ordner or INBOX_DIR
        ziel.mkdir(parents=True, exist_ok=True)

        erlaubte_typen = self.config.get("erlaubte_typen", [".pdf"])
        max_groesse = self.config.get("max_groesse_mb", 25) * 1024 * 1024
        bereits_verarbeitet = bereits_verarbeitet or set()

        # Determine the folders to search
        if alle_ordner:
            ordner_liste = self.liste_ordner()
            yield {"type": "info", "nachricht": f"{len(ordner_liste)} Ordner gefunden"}
        else:
            ordner_liste = [self.config.get("ordner", "INBOX")]

        for ordner in ordner_liste:
            yield {"type": "ordner", "name": ordner}

            try:
                status, _ = self.connection.select(ordner)
                search_criteria = "(UNSEEN)" if nur_ungelesen else "ALL"
                status, messages = self.connection.search(None, search_criteria)

                if status != "OK":
                    continue

                mail_ids = messages[0].split()
                yield {"type": "mails", "ordner": ordner, "anzahl": len(mail_ids)}

                for mail_id in mail_ids:
                    try:
                        status, msg_data = self.connection.fetch(mail_id, "(RFC822)")
                        if status != "OK":
                            continue

                        msg = email.message_from_bytes(msg_data[0][1])
                        message_id = msg.get("Message-ID", "")

                        if message_id and message_id in bereits_verarbeitet:
                            continue

                        absender = self._decode_header(msg.get("From", ""))
                        betreff = self._decode_header(msg.get("Subject", ""))
                        datum = msg.get("Date", "")

                        for part in msg.walk():
                            if part.get_content_maintype() == "multipart":
                                continue

                            filename = part.get_filename()
                            if not filename:
                                continue

                            filename = self._decode_header(filename)
                            datei_endung = Path(filename).suffix.lower()

                            if datei_endung not in erlaubte_typen:
                                continue

                            payload = part.get_payload(decode=True)
                            if not payload:
                                continue

                            if len(payload) > max_groesse:
                                yield {"type": "skip", "datei": filename, "grund": "zu groß"}
                                continue

                            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                            safe_filename = self._safe_filename(filename)
                            ziel_pfad = ziel / f"{timestamp}_{safe_filename}"

                            counter = 1
                            while ziel_pfad.exists():
                                ziel_pfad = ziel / f"{timestamp}_{counter}_{safe_filename}"
                                counter += 1

                            ziel_pfad.write_bytes(payload)

                            yield {
                                "type": "datei",
                                "pfad": str(ziel_pfad),
                                "original_name": filename,
                                "absender": absender,
                                "betreff": betreff[:100] if betreff else "",
                                "datum": datum,
                                "groesse": len(payload),
                                "message_id": message_id,
                                "ordner": ordner
                            }

                        if markiere_gelesen:
                            self.connection.store(mail_id, "+FLAGS", "\\Seen")

                    except Exception as e:
                        yield {"type": "fehler", "nachricht": f"Mail-Fehler: {str(e)[:100]}"}
                        continue

            except Exception as e:
                yield {"type": "fehler", "nachricht": f"Ordner-Fehler {ordner}: {str(e)[:100]}"}

    def test_connection(self) -> Dict:
        """Tests the connection and returns the status"""
        try:
            if self.connect():
                # List the folders
                status, folders = self.connection.list()
                ordner_liste = []
                if status == "OK":
                    for folder in folders:
                        if isinstance(folder, bytes):
                            ordner_liste.append(folder.decode())
                self.disconnect()
                return {
                    "erfolg": True,
                    "nachricht": "Verbindung erfolgreich",
                    "ordner": ordner_liste
                }
            else:
                return {
                    "erfolg": False,
                    "nachricht": "Verbindung fehlgeschlagen"
                }
        except Exception as e:
            return {
                "erfolg": False,
                "nachricht": str(e)
            }
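A minimal usage sketch (server, address, and password are placeholders you would fill in from your own mailbox configuration; in the app itself these values come from the Postfach records):

```python
from pathlib import Path
from backend.app.modules.mail_fetcher import MailFetcher

fetcher = MailFetcher({
    "imap_server": "imap.example.com",  # placeholder
    "imap_port": 993,
    "email": "user@example.com",        # placeholder
    "passwort": "app-password",         # placeholder
    "ordner": "INBOX",
    "erlaubte_typen": [".pdf"],
    "max_groesse_mb": 25,
})

status = fetcher.test_connection()
print(status["nachricht"], status.get("ordner", []))

if status["erfolg"]:
    dateien = fetcher.fetch_attachments(
        ziel_ordner=Path("data/inbox"),
        nur_ungelesen=True,
        bereits_verarbeitet=set(),  # e.g. Message-IDs loaded from VerarbeiteteMail
    )
    fetcher.disconnect()
    for d in dateien:
        print(d["original_name"], "->", d["pfad"])
```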
248
backend/app/modules/pdf_processor.py
Normal file
248
backend/app/modules/pdf_processor.py
Normal file
|
|
@ -0,0 +1,248 @@
|
||||||
|
"""
|
||||||
|
PDF-Processor Modul
|
||||||
|
Text-Extraktion, OCR und ZUGFeRD-Erkennung
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional, Tuple
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Versuche Libraries zu importieren
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
PDFPLUMBER_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
PDFPLUMBER_AVAILABLE = False
|
||||||
|
logger.warning("pdfplumber nicht installiert")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from pypdf import PdfReader
|
||||||
|
PYPDF_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
PYPDF_AVAILABLE = False
|
||||||
|
logger.warning("pypdf nicht installiert")
|
||||||
|
|
||||||
|
|
||||||
|
class PDFProcessor:
|
||||||
|
"""Verarbeitet PDFs: Text-Extraktion, OCR, ZUGFeRD-Erkennung"""
|
||||||
|
|
||||||
|
def __init__(self, ocr_language: str = "deu", ocr_dpi: int = 300):
|
||||||
|
self.ocr_language = ocr_language
|
||||||
|
self.ocr_dpi = ocr_dpi
|
||||||
|
|
||||||
|
def verarbeite(self, pdf_pfad: str) -> Dict:
|
||||||
|
"""
|
||||||
|
Vollständige PDF-Verarbeitung
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mit: text, ist_zugferd, zugferd_xml, hat_text, ocr_durchgefuehrt
|
||||||
|
"""
|
||||||
|
pfad = Path(pdf_pfad)
|
||||||
|
if not pfad.exists():
|
||||||
|
return {"fehler": f"Datei nicht gefunden: {pdf_pfad}"}
|
||||||
|
|
||||||
|
ergebnis = {
|
||||||
|
"pfad": str(pfad),
|
||||||
|
"text": "",
|
||||||
|
"ist_zugferd": False,
|
||||||
|
"zugferd_xml": None,
|
||||||
|
"hat_text": False,
|
||||||
|
"ocr_durchgefuehrt": False,
|
||||||
|
"seiten": 0
|
||||||
|
}
|
||||||
|
|
||||||
|
# 1. ZUGFeRD prüfen
|
||||||
|
zugferd_result = self.pruefe_zugferd(pdf_pfad)
|
||||||
|
ergebnis["ist_zugferd"] = zugferd_result["ist_zugferd"]
|
||||||
|
ergebnis["zugferd_xml"] = zugferd_result.get("xml")
|
||||||
|
|
||||||
|
# 2. Text extrahieren
|
||||||
|
text, seiten = self.extrahiere_text(pdf_pfad)
|
||||||
|
ergebnis["text"] = text
|
||||||
|
ergebnis["seiten"] = seiten
|
||||||
|
ergebnis["hat_text"] = bool(text and len(text.strip()) > 50)
|
||||||
|
|
||||||
|
# 3. OCR falls kein Text (aber NICHT bei ZUGFeRD!)
|
||||||
|
if not ergebnis["hat_text"] and not ergebnis["ist_zugferd"]:
|
||||||
|
logger.info(f"Kein Text gefunden, starte OCR für {pfad.name}")
|
||||||
|
ocr_text, ocr_erfolg = self.fuehre_ocr_aus(pdf_pfad)
|
||||||
|
if ocr_erfolg:
|
||||||
|
ergebnis["text"] = ocr_text
|
||||||
|
ergebnis["hat_text"] = bool(ocr_text and len(ocr_text.strip()) > 50)
|
||||||
|
ergebnis["ocr_durchgefuehrt"] = True
|
||||||
|
|
||||||
|
return ergebnis
|
||||||
|
|
||||||
|
def extrahiere_text(self, pdf_pfad: str) -> Tuple[str, int]:
|
||||||
|
"""
|
||||||
|
Extrahiert Text aus PDF
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple von (text, seitenanzahl)
|
||||||
|
"""
|
||||||
|
text_parts = []
|
||||||
|
seiten = 0
|
||||||
|
|
||||||
|
# Methode 1: pdfplumber (besser für Tabellen)
|
||||||
|
if PDFPLUMBER_AVAILABLE:
|
||||||
|
try:
|
||||||
|
with pdfplumber.open(pdf_pfad) as pdf:
|
||||||
|
seiten = len(pdf.pages)
|
||||||
|
for page in pdf.pages:
|
||||||
|
page_text = page.extract_text()
|
||||||
|
if page_text:
|
||||||
|
text_parts.append(page_text)
|
||||||
|
if text_parts:
|
||||||
|
return "\n\n".join(text_parts), seiten
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"pdfplumber Fehler: {e}")
|
||||||
|
|
||||||
|
# Methode 2: pypdf (Fallback)
|
||||||
|
if PYPDF_AVAILABLE:
|
||||||
|
try:
|
||||||
|
reader = PdfReader(pdf_pfad)
|
||||||
|
seiten = len(reader.pages)
|
||||||
|
for page in reader.pages:
|
||||||
|
page_text = page.extract_text()
|
||||||
|
if page_text:
|
||||||
|
text_parts.append(page_text)
|
||||||
|
if text_parts:
|
||||||
|
return "\n\n".join(text_parts), seiten
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"pypdf Fehler: {e}")
|
||||||
|
|
||||||
|
# Methode 3: pdftotext CLI (Fallback)
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["pdftotext", "-layout", pdf_pfad, "-"],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
return result.stdout, seiten
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"pdftotext Fehler: {e}")
|
||||||
|
|
||||||
|
return "", seiten
|
||||||
|
|
||||||
|
def pruefe_zugferd(self, pdf_pfad: str) -> Dict:
|
||||||
|
"""
|
||||||
|
Prüft ob PDF eine ZUGFeRD/Factur-X Rechnung ist
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mit: ist_zugferd, xml (falls vorhanden)
|
||||||
|
"""
|
||||||
|
ergebnis = {"ist_zugferd": False, "xml": None}
|
||||||
|
|
||||||
|
# Methode 1: factur-x Library
|
||||||
|
try:
|
||||||
|
from facturx import get_facturx_xml_from_pdf
|
||||||
|
xml_bytes = get_facturx_xml_from_pdf(pdf_pfad)
|
||||||
|
if xml_bytes:
|
||||||
|
ergebnis["ist_zugferd"] = True
|
||||||
|
ergebnis["xml"] = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes
|
||||||
|
logger.info(f"ZUGFeRD erkannt: {Path(pdf_pfad).name}")
|
||||||
|
return ergebnis
|
||||||
|
except ImportError:
|
||||||
|
logger.debug("factur-x nicht installiert")
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"factur-x Fehler: {e}")
|
||||||
|
|
||||||
|
# Methode 2: Manuell nach XML-Attachment suchen
|
||||||
|
if PYPDF_AVAILABLE:
|
||||||
|
try:
|
||||||
|
reader = PdfReader(pdf_pfad)
|
||||||
|
if "/Names" in reader.trailer.get("/Root", {}):
|
||||||
|
# Embedded Files prüfen
|
||||||
|
pass # Komplexere Logik hier
|
||||||
|
|
||||||
|
# Alternativ: Im Text nach ZUGFeRD-Markern suchen
|
||||||
|
for page in reader.pages[:1]: # Nur erste Seite
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
if any(marker in text.upper() for marker in ["ZUGFERD", "FACTUR-X", "EN 16931"]):
|
||||||
|
ergebnis["ist_zugferd"] = True
|
||||||
|
logger.info(f"ZUGFeRD-Marker gefunden: {Path(pdf_pfad).name}")
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"ZUGFeRD-Prüfung Fehler: {e}")
|
||||||
|
|
||||||
|
return ergebnis
|
||||||
|
|
||||||
|
    def fuehre_ocr_aus(self, pdf_pfad: str) -> Tuple[str, bool]:
        """
        Runs OCR via ocrmypdf

        Returns:
            Tuple of (text, success)
        """
        pfad = Path(pdf_pfad)
        temp_pfad = pfad.with_suffix(".ocr.pdf")

        try:
            # Run ocrmypdf. Note: --skip-text and --force-ocr are mutually
            # exclusive in ocrmypdf, so only --skip-text is passed here.
            result = subprocess.run(
                [
                    "ocrmypdf",
                    "--language", self.ocr_language,
                    "--deskew",     # straighten skewed scans
                    "--clean",      # clean up the image (requires unpaper)
                    "--skip-text",  # skip pages that already contain text
                    str(pfad),
                    str(temp_pfad)
                ],
                capture_output=True,
                text=True,
                timeout=120  # 2 minute timeout
            )

            if result.returncode == 0 and temp_pfad.exists():
                # Replace the original with the OCR version
                pfad.unlink()
                temp_pfad.rename(pfad)

                # Extract text from the OCR'd PDF
                text, _ = self.extrahiere_text(str(pfad))
                return text, True
            else:
                logger.error(f"OCR Fehler: {result.stderr}")
                if temp_pfad.exists():
                    temp_pfad.unlink()
                return "", False

        except subprocess.TimeoutExpired:
            logger.error(f"OCR Timeout für {pfad.name}")
            if temp_pfad.exists():
                temp_pfad.unlink()
            return "", False
        except FileNotFoundError:
            logger.error("ocrmypdf nicht installiert")
            return "", False
        except Exception as e:
            logger.error(f"OCR Fehler: {e}")
            if temp_pfad.exists():
                temp_pfad.unlink()
            return "", False
    def extrahiere_metadaten(self, pdf_pfad: str) -> Dict:
        """Extracts PDF metadata"""
        metadaten = {}

        if PYPDF_AVAILABLE:
            try:
                reader = PdfReader(pdf_pfad)
                if reader.metadata:
                    metadaten = {
                        "titel": reader.metadata.get("/Title", ""),
                        "autor": reader.metadata.get("/Author", ""),
                        "ersteller": reader.metadata.get("/Creator", ""),
                        "erstellt": reader.metadata.get("/CreationDate", ""),
                    }
            except Exception as e:
                logger.debug(f"Metadaten-Fehler: {e}")

        return metadaten
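The embedded-file branch of `pruefe_zugferd` is still a stub (`pass  # More complex logic goes here`). A minimal sketch of what that check could look like, assuming pypdf's `attachments` property is available (it is in recent pypdf releases) and using the attachment names the Factur-X standard prescribes; `finde_zugferd_xml` is a hypothetical helper, not part of the module:

```python
from typing import Optional

from pypdf import PdfReader


def finde_zugferd_xml(pdf_pfad: str) -> Optional[str]:
    """Returns the embedded ZUGFeRD/Factur-X XML of a PDF, if any."""
    reader = PdfReader(pdf_pfad)
    # pypdf exposes embedded files as {name: [content_bytes, ...]}
    for name, inhalte in reader.attachments.items():
        if name.lower() in ("factur-x.xml", "zugferd-invoice.xml", "xrechnung.xml"):
            return inhalte[0].decode("utf-8", errors="replace")
    return None
```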
backend/app/modules/sorter.py (Normal file, 323 lines)
@@ -0,0 +1,323 @@
"""
|
||||||
|
Sorter Modul
|
||||||
|
Regel-basierte Erkennung und Benennung von Dokumenten
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from .extraktoren import extrahiere_alle_felder, baue_dateiname
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Sorter:
|
||||||
|
"""Sortiert und benennt Dokumente basierend auf Regeln"""
|
||||||
|
|
||||||
|
def __init__(self, regeln: List[Dict]):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
regeln: Liste von Regel-Dicts, sortiert nach Priorität
|
||||||
|
"""
|
||||||
|
# Nach Priorität sortieren (niedrig = wichtig)
|
||||||
|
self.regeln = sorted(regeln, key=lambda r: r.get("prioritaet", 100))
|
||||||
|
|
||||||
|
    def finde_passende_regel(self, dokument_info: Dict) -> Optional[Dict]:
        """
        Finds the first matching rule for a document

        Args:
            dokument_info: Dict with text, original_name, absender, etc.

        Returns:
            Matching rule or None
        """
        for regel in self.regeln:
            if not regel.get("aktiv", True):
                continue

            muster = regel.get("muster", {})
            if self._pruefe_muster(muster, dokument_info):
                logger.info(f"Regel '{regel.get('name')}' matched für {dokument_info.get('original_name')}")
                return regel

        return None

    def _pruefe_muster(self, muster: Dict, dokument_info: Dict) -> bool:
        """Checks whether all patterns match the document"""
        text = dokument_info.get("text", "").lower()
        original_name = dokument_info.get("original_name", "").lower()
        absender = dokument_info.get("absender", "").lower()

        # keywords (simple comma-separated list - for the UI)
        if "keywords" in muster:
            keywords = muster["keywords"]
            if isinstance(keywords, str):
                keywords = [k.strip() for k in keywords.split(",")]
            # All keywords must occur
            for keyword in keywords:
                keyword = keyword.lower().strip()
                if keyword and keyword not in text and keyword not in original_name:
                    return False

        # absender_contains
        if "absender_contains" in muster:
            if muster["absender_contains"].lower() not in absender:
                return False

        # dateiname_match
        if "dateiname_match" in muster:
            pattern = muster["dateiname_match"]
            if isinstance(pattern, str):
                if pattern.lower() not in original_name:
                    return False
            elif isinstance(pattern, list):
                if not any(p.lower() in original_name for p in pattern):
                    return False

        # text_match (all must be contained)
        if "text_match" in muster:
            patterns = muster["text_match"]
            if isinstance(patterns, str):
                patterns = [patterns]
            for pattern in patterns:
                if pattern.lower() not in text:
                    return False

        # text_match_any (at least one must be contained)
        if "text_match_any" in muster:
            patterns = muster["text_match_any"]
            if isinstance(patterns, str):
                patterns = [patterns]
            if not any(p.lower() in text for p in patterns):
                return False

        # text_regex
        if "text_regex" in muster:
            pattern = muster["text_regex"]
            if not re.search(pattern, text, re.IGNORECASE):
                return False

        return True
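    # Example "muster" combining several of the checks above (values are
    # illustrative, not part of the shipped rule set):
    #   {"keywords": "rechnung, sonepar",
    #    "absender_contains": "@sonepar",
    #    "text_regex": r"Rechnungs-?Nr"}
    # A rule matches only if every condition listed in its muster holds.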
    def extrahiere_felder(self, regel: Dict, dokument_info: Dict) -> Dict[str, Any]:
        """
        Extracts fields from the document - uses the global extractors with fallbacks

        Returns:
            Dict with extracted values
        """
        text = dokument_info.get("text", "")
        regel_extraktion = regel.get("extraktion", {})

        # Use the global extractors (with rule-specific overrides)
        felder = extrahiere_alle_felder(text, dokument_info, regel_extraktion)

        # Rule-specific static values override the extracted ones
        for feld_name, feld_config in regel_extraktion.items():
            if isinstance(feld_config, dict):
                if "wert" in feld_config:
                    felder[feld_name] = feld_config["wert"]
                elif "regex" in feld_config:
                    # Single regex from the rule
                    wert = self._extrahiere_mit_regex(feld_config, text)
                    if wert:
                        felder[feld_name] = wert
            elif isinstance(feld_config, str):
                # Direct static value
                felder[feld_name] = feld_config

        return felder

    def _extrahiere_mit_regex(self, config: Dict, text: str) -> Optional[str]:
        """Extracts a field with a single regex"""
        try:
            match = re.search(config["regex"], text, re.IGNORECASE | re.MULTILINE)
            if match:
                wert = match.group(1) if match.groups() else match.group(0)

                # Format dates
                if "format" in config:
                    try:
                        datum = datetime.strptime(wert.strip(), config["format"])
                        return datum.strftime("%Y-%m-%d")
                    except ValueError:
                        pass

                # Format amounts
                if config.get("typ") == "betrag":
                    wert = self._formatiere_betrag(wert)

                return wert.strip()
        except Exception as e:
            logger.debug(f"Regex-Extraktion fehlgeschlagen: {e}")

        return None

    def _formatiere_betrag(self, betrag: str) -> str:
        """Normalizes an amount (assumes German notation, e.g. 1.234,56)"""
        betrag = betrag.replace(" ", "").replace(".", "").replace(",", ".")

        try:
            wert = float(betrag)
            if wert == int(wert):
                return str(int(wert))
            return f"{wert:.2f}".replace(".", ",")
        except ValueError:
            return betrag

    def generiere_dateinamen(self, regel: Dict, extrahierte_felder: Dict) -> str:
        """
        Generates the new file name based on the schema
        Uses the smart schema builder, which drops missing fields
        """
        schema = regel.get("schema", "{datum} - Dokument.pdf")
        return baue_dateiname(schema, extrahierte_felder, ".pdf")

    def verschiebe_datei(self, quell_pfad: str, ziel_ordner: str, neuer_name: str) -> str:
        """
        Moves and renames a file

        Returns:
            New path of the file
        """
        ziel_dir = Path(ziel_ordner)
        ziel_dir.mkdir(parents=True, exist_ok=True)

        ziel_pfad = ziel_dir / neuer_name

        # Ensure a unique name
        counter = 1
        original_name = ziel_pfad.stem
        suffix = ziel_pfad.suffix
        while ziel_pfad.exists():
            ziel_pfad = ziel_dir / f"{original_name} ({counter}){suffix}"
            counter += 1

        # Move
        shutil.move(quell_pfad, ziel_pfad)
        logger.info(f"Verschoben: {quell_pfad} -> {ziel_pfad}")

        return str(ziel_pfad)
# ============ STANDARD DOCUMENT TYPES ============
# These are used by the simple UI

DOKUMENTTYPEN = {
    "rechnung": {
        "name": "Rechnung",
        "keywords": ["rechnung", "invoice"],
        "schema": "{datum} - Rechnung - {firma} - {nummer} - {betrag} EUR.pdf",
        "unterordner": "rechnungen"
    },
    "angebot": {
        "name": "Angebot",
        "keywords": ["angebot", "quotation", "offerte"],
        "schema": "{datum} - Angebot - {firma} - {nummer} - {betrag} EUR.pdf",
        "unterordner": "angebote"
    },
    "gutschrift": {
        "name": "Gutschrift",
        "keywords": ["gutschrift", "credit"],
        "schema": "{datum} - Gutschrift - {firma} - {nummer} - {betrag} EUR.pdf",
        "unterordner": "gutschriften"
    },
    "lieferschein": {
        "name": "Lieferschein",
        "keywords": ["lieferschein", "delivery"],
        "schema": "{datum} - Lieferschein - {firma} - {nummer}.pdf",
        "unterordner": "lieferscheine"
    },
    "auftragsbestaetigung": {
        "name": "Auftragsbestätigung",
        "keywords": ["auftragsbestätigung", "bestellbestätigung"],
        "schema": "{datum} - Auftragsbestätigung - {firma} - {nummer}.pdf",
        "unterordner": "auftraege"
    },
    "vertrag": {
        "name": "Vertrag",
        "keywords": ["vertrag", "contract"],
        "schema": "Vertrag - {firma} - {nummer} - {datum}.pdf",
        "unterordner": "vertraege"
    },
    "versicherung": {
        "name": "Versicherung",
        "keywords": ["versicherung", "police", "beitrag"],
        "schema": "Versicherung - {firma} - {nummer} - {datum}.pdf",
        "unterordner": "versicherungen"
    },
    "zeugnis": {
        "name": "Zeugnis",
        "keywords": ["zeugnis", "zertifikat"],
        "schema": "Zeugnis - {firma} - {nummer} - {datum}.pdf",
        "unterordner": "zeugnisse"
    },
    "bescheinigung": {
        "name": "Bescheinigung",
        "keywords": ["bescheinigung", "nachweis", "bestätigung"],
        "schema": "Bescheinigung - {firma} - {nummer} - {datum}.pdf",
        "unterordner": "bescheinigungen"
    },
    "kontoauszug": {
        "name": "Kontoauszug",
        "keywords": ["kontoauszug", "account statement"],
        "schema": "{datum} - Kontoauszug - {firma} - {nummer}.pdf",
        "unterordner": "kontoauszuege"
    },
    "sonstiges": {
        "name": "Sonstiges",
        "keywords": [],
        "schema": "{datum} - {typ} - {firma}.pdf",
        "unterordner": "sonstiges"
    }
}
def erstelle_einfache_regel(name: str, dokumenttyp: str, keywords: str,
                            firma_wert: Optional[str] = None, unterordner: Optional[str] = None,
                            prioritaet: int = 50) -> Dict:
    """
    Creates a simple rule based on a document type

    Args:
        name: Name of the rule (e.g. "Sonepar Rechnung")
        dokumenttyp: Type from DOKUMENTTYPEN
        keywords: Comma-separated keywords for recognition
        firma_wert: Optional fixed company value
        unterordner: Optional subfolder (overrides the default)
        prioritaet: Priority (lower = more important)

    Returns:
        Rule dict for the database
    """
    typ_config = DOKUMENTTYPEN.get(dokumenttyp, DOKUMENTTYPEN["sonstiges"])

    regel = {
        "name": name,
        "prioritaet": prioritaet,
        "aktiv": True,
        "muster": {
            "keywords": keywords
        },
        "extraktion": {},
        "schema": typ_config["schema"],
        "unterordner": unterordner or typ_config["unterordner"]
    }

    # Fixed company if given
    if firma_wert:
        regel["extraktion"]["firma"] = {"wert": firma_wert}

    return regel


def liste_dokumenttypen() -> List[Dict]:
    """Returns a list of all document types for the UI"""
    return [
        {"id": key, "name": config["name"], "schema": config["schema"]}
        for key, config in DOKUMENTTYPEN.items()
    ]
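Taken together, the helpers above form a small match-extract-name pipeline. A usage sketch (rule values and sample text are made up; the actual field detection happens in `extrahiere_alle_felder` from the `extraktoren` module):

```python
from backend.app.modules.sorter import Sorter, erstelle_einfache_regel

# Build a simple invoice rule and apply it to a sample document
regel = erstelle_einfache_regel(
    name="Sonepar Rechnung",
    dokumenttyp="rechnung",
    keywords="rechnung, sonepar",
    firma_wert="Sonepar",
)
sorter = Sorter([regel])
doc = {
    "text": "Rechnung Nr. 4711 vom 01.02.2026 - Sonepar Deutschland",
    "original_name": "scan.pdf",
    "absender": "",
}
treffer = sorter.finde_passende_regel(doc)
if treffer:
    felder = sorter.extrahiere_felder(treffer, doc)
    print(sorter.generiere_dateinamen(treffer, felder))
```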
backend/app/routes/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
# API Routes

backend/app/routes/__pycache__/__init__.cpython-313.pyc (BIN, Binary file not shown.)
backend/app/routes/__pycache__/api.cpython-313.pyc (BIN, Binary file not shown.)

backend/app/routes/api.py (Normal file, 851 lines)
@@ -0,0 +1,851 @@
"""
|
||||||
|
API Routes - Getrennte Bereiche: Mail-Abruf und Datei-Sortierung
|
||||||
|
"""
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from typing import List, Optional
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from ..models.database import get_db, Postfach, QuellOrdner, SortierRegel, VerarbeiteteDatei, VerarbeiteteMail
|
||||||
|
from ..modules.mail_fetcher import MailFetcher
|
||||||
|
from ..modules.pdf_processor import PDFProcessor
|
||||||
|
from ..modules.sorter import Sorter
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api", tags=["api"])
|
||||||
|
|
||||||
|
|
||||||
|
# ============ Pydantic models ============

class PostfachCreate(BaseModel):
    name: str
    imap_server: str
    imap_port: int = 993
    email: str
    passwort: str
    ordner: str = "INBOX"
    alle_ordner: bool = False  # search all IMAP folders
    nur_ungelesen: bool = False  # only unread mails (False = all)
    ziel_ordner: str
    erlaubte_typen: List[str] = [".pdf"]
    max_groesse_mb: int = 25


class PostfachResponse(BaseModel):
    id: int
    name: str
    imap_server: str
    email: str
    ordner: str
    alle_ordner: bool
    nur_ungelesen: bool
    ziel_ordner: str
    erlaubte_typen: List[str]
    max_groesse_mb: int
    letzter_abruf: Optional[datetime]
    letzte_anzahl: int

    class Config:
        from_attributes = True


class OrdnerCreate(BaseModel):
    name: str
    pfad: str
    ziel_ordner: str
    rekursiv: bool = True
    dateitypen: List[str] = [".pdf", ".jpg", ".jpeg", ".png", ".tiff"]


class OrdnerResponse(BaseModel):
    id: int
    name: str
    pfad: str
    ziel_ordner: str
    rekursiv: bool
    dateitypen: List[str]
    aktiv: bool

    class Config:
        from_attributes = True


class RegelCreate(BaseModel):
    name: str
    prioritaet: int = 100
    muster: dict = {}
    extraktion: dict = {}
    schema: str = "{datum} - Dokument.pdf"
    unterordner: Optional[str] = None


class RegelResponse(BaseModel):
    id: int
    name: str
    prioritaet: int
    aktiv: bool
    muster: dict
    extraktion: dict
    schema: str
    unterordner: Optional[str]

    class Config:
        from_attributes = True


class RegelTestRequest(BaseModel):
    regel: dict
    text: str
# ============ Directory browser ============

@router.get("/browse")
def browse_directory(path: str = "/"):
    """Lists directories for the file browser"""
    import os

    # Security: only allow certain base paths
    # (note that startswith also matches e.g. /srv-foo; use
    # os.path.commonpath for a strict check)
    allowed_bases = ["/srv", "/home", "/mnt", "/media", "/data", "/tmp"]
    path = os.path.abspath(path)

    # Check whether the path is allowed
    is_allowed = any(path.startswith(base) for base in allowed_bases) or path == "/"
    if not is_allowed:
        return {"error": "Pfad nicht erlaubt", "entries": []}

    if not os.path.exists(path):
        return {"error": "Pfad existiert nicht", "entries": []}

    if not os.path.isdir(path):
        return {"error": "Kein Verzeichnis", "entries": []}

    try:
        entries = []
        for entry in sorted(os.listdir(path)):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path):
                entries.append({
                    "name": entry,
                    "path": full_path,
                    "type": "directory"
                })

        return {
            "current": path,
            "parent": os.path.dirname(path) if path != "/" else None,
            "entries": entries
        }
    except PermissionError:
        return {"error": "Zugriff verweigert", "entries": []}
# ============ Area 1: Mailboxes ============

@router.get("/postfaecher", response_model=List[PostfachResponse])
def liste_postfaecher(db: Session = Depends(get_db)):
    return db.query(Postfach).all()


@router.post("/postfaecher", response_model=PostfachResponse)
def erstelle_postfach(data: PostfachCreate, db: Session = Depends(get_db)):
    postfach = Postfach(**data.dict())
    db.add(postfach)
    db.commit()
    db.refresh(postfach)
    return postfach


@router.put("/postfaecher/{id}", response_model=PostfachResponse)
def aktualisiere_postfach(id: int, data: PostfachCreate, db: Session = Depends(get_db)):
    postfach = db.query(Postfach).filter(Postfach.id == id).first()
    if not postfach:
        raise HTTPException(status_code=404, detail="Nicht gefunden")

    update_data = data.dict()
    # Only update the password if it is non-empty
    if not update_data.get("passwort"):
        del update_data["passwort"]

    for key, value in update_data.items():
        setattr(postfach, key, value)

    db.commit()
    db.refresh(postfach)
    return postfach


@router.delete("/postfaecher/{id}")
def loesche_postfach(id: int, db: Session = Depends(get_db)):
    postfach = db.query(Postfach).filter(Postfach.id == id).first()
    if not postfach:
        raise HTTPException(status_code=404, detail="Nicht gefunden")
    db.delete(postfach)
    db.commit()
    return {"message": "Gelöscht"}


@router.post("/postfaecher/{id}/test")
def teste_postfach(id: int, db: Session = Depends(get_db)):
    postfach = db.query(Postfach).filter(Postfach.id == id).first()
    if not postfach:
        raise HTTPException(status_code=404, detail="Nicht gefunden")

    fetcher = MailFetcher({
        "imap_server": postfach.imap_server,
        "imap_port": postfach.imap_port,
        "email": postfach.email,
        "passwort": postfach.passwort,
        "ordner": postfach.ordner
    })
    return fetcher.test_connection()
@router.get("/postfaecher/{id}/abrufen/stream")
|
||||||
|
def rufe_postfach_ab_stream(id: int, db: Session = Depends(get_db)):
|
||||||
|
"""Streaming-Endpoint für Mail-Abruf mit Live-Updates"""
|
||||||
|
postfach = db.query(Postfach).filter(Postfach.id == id).first()
|
||||||
|
if not postfach:
|
||||||
|
raise HTTPException(status_code=404, detail="Nicht gefunden")
|
||||||
|
|
||||||
|
# Daten kopieren für Generator (Session ist nach return nicht mehr verfügbar)
|
||||||
|
pf_data = {
|
||||||
|
"id": postfach.id,
|
||||||
|
"name": postfach.name,
|
||||||
|
"imap_server": postfach.imap_server,
|
||||||
|
"imap_port": postfach.imap_port,
|
||||||
|
"email": postfach.email,
|
||||||
|
"passwort": postfach.passwort,
|
||||||
|
"ordner": postfach.ordner,
|
||||||
|
"alle_ordner": postfach.alle_ordner,
|
||||||
|
"erlaubte_typen": postfach.erlaubte_typen,
|
||||||
|
"max_groesse_mb": postfach.max_groesse_mb,
|
||||||
|
"ziel_ordner": postfach.ziel_ordner
|
||||||
|
}
|
||||||
|
|
||||||
|
# Bereits verarbeitete Message-IDs laden
|
||||||
|
bereits_verarbeitet = set(
|
||||||
|
row.message_id for row in
|
||||||
|
db.query(VerarbeiteteMail.message_id)
|
||||||
|
.filter(VerarbeiteteMail.postfach_id == id)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
def event_generator():
|
||||||
|
from ..models.database import SessionLocal
|
||||||
|
|
||||||
|
def send_event(data):
|
||||||
|
return f"data: {json.dumps(data)}\n\n"
|
||||||
|
|
||||||
|
yield send_event({"type": "start", "postfach": pf_data["name"], "bereits_verarbeitet": len(bereits_verarbeitet)})
|
||||||
|
|
||||||
|
# Zielordner erstellen
|
||||||
|
ziel = Path(pf_data["ziel_ordner"])
|
||||||
|
ziel.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
fetcher = MailFetcher({
|
||||||
|
"imap_server": pf_data["imap_server"],
|
||||||
|
"imap_port": pf_data["imap_port"],
|
||||||
|
"email": pf_data["email"],
|
||||||
|
"passwort": pf_data["passwort"],
|
||||||
|
"ordner": pf_data["ordner"],
|
||||||
|
"erlaubte_typen": pf_data["erlaubte_typen"],
|
||||||
|
"max_groesse_mb": pf_data["max_groesse_mb"]
|
||||||
|
})
|
||||||
|
|
||||||
|
attachments = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Generator für streaming
|
||||||
|
for event in fetcher.fetch_attachments_generator(
|
||||||
|
ziel,
|
||||||
|
nur_ungelesen=False,
|
||||||
|
alle_ordner=pf_data["alle_ordner"],
|
||||||
|
bereits_verarbeitet=bereits_verarbeitet
|
||||||
|
):
|
||||||
|
yield send_event(event)
|
||||||
|
|
||||||
|
if event.get("type") == "datei":
|
||||||
|
attachments.append(event)
|
||||||
|
|
||||||
|
# DB-Session für Speicherung
|
||||||
|
session = SessionLocal()
|
||||||
|
try:
|
||||||
|
verarbeitete_msg_ids = set()
|
||||||
|
for att in attachments:
|
||||||
|
msg_id = att.get("message_id")
|
||||||
|
if msg_id and msg_id not in verarbeitete_msg_ids:
|
||||||
|
verarbeitete_msg_ids.add(msg_id)
|
||||||
|
session.add(VerarbeiteteMail(
|
||||||
|
postfach_id=pf_data["id"],
|
||||||
|
message_id=msg_id,
|
||||||
|
ordner=att.get("ordner", ""),
|
||||||
|
betreff=att.get("betreff", "")[:500] if att.get("betreff") else None,
|
||||||
|
absender=att.get("absender", "")[:255] if att.get("absender") else None,
|
||||||
|
anzahl_attachments=1
|
||||||
|
))
|
||||||
|
|
||||||
|
# Postfach aktualisieren
|
||||||
|
pf = session.query(Postfach).filter(Postfach.id == pf_data["id"]).first()
|
||||||
|
if pf:
|
||||||
|
pf.letzter_abruf = datetime.utcnow()
|
||||||
|
pf.letzte_anzahl = len(attachments)
|
||||||
|
session.commit()
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
yield send_event({"type": "fertig", "anzahl": len(attachments)})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
yield send_event({"type": "fehler", "nachricht": str(e)})
|
||||||
|
finally:
|
||||||
|
fetcher.disconnect()
|
||||||
|
|
||||||
|
return StreamingResponse(
|
||||||
|
event_generator(),
|
||||||
|
media_type="text/event-stream",
|
||||||
|
headers={
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Accel-Buffering": "no"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
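Every update is a standard SSE frame (`data: <json>` followed by a blank line), so any SSE-capable client can consume the endpoint. A minimal reader sketch (assumes the `requests` package, a dev server on localhost:8000, and a mailbox with id 1):

```python
import requests

# Stream the live progress of a mailbox fetch, one JSON event per frame
with requests.get(
    "http://localhost:8000/api/postfaecher/1/abrufen/stream",
    stream=True,
    timeout=None,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line.startswith("data: "):
            print(line[len("data: "):])
```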
@router.post("/postfaecher/{id}/abrufen")
|
||||||
|
def rufe_postfach_ab(id: int, db: Session = Depends(get_db)):
|
||||||
|
postfach = db.query(Postfach).filter(Postfach.id == id).first()
|
||||||
|
if not postfach:
|
||||||
|
raise HTTPException(status_code=404, detail="Nicht gefunden")
|
||||||
|
|
||||||
|
# Bereits verarbeitete Message-IDs laden
|
||||||
|
bereits_verarbeitet = set(
|
||||||
|
row.message_id for row in
|
||||||
|
db.query(VerarbeiteteMail.message_id)
|
||||||
|
.filter(VerarbeiteteMail.postfach_id == id)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Zielordner erstellen
|
||||||
|
ziel = Path(postfach.ziel_ordner)
|
||||||
|
ziel.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
fetcher = MailFetcher({
|
||||||
|
"imap_server": postfach.imap_server,
|
||||||
|
"imap_port": postfach.imap_port,
|
||||||
|
"email": postfach.email,
|
||||||
|
"passwort": postfach.passwort,
|
||||||
|
"ordner": postfach.ordner,
|
||||||
|
"erlaubte_typen": postfach.erlaubte_typen,
|
||||||
|
"max_groesse_mb": postfach.max_groesse_mb
|
||||||
|
})
|
||||||
|
|
||||||
|
try:
|
||||||
|
attachments = fetcher.fetch_attachments(
|
||||||
|
ziel,
|
||||||
|
nur_ungelesen=False, # Alle Mails durchsuchen
|
||||||
|
alle_ordner=postfach.alle_ordner,
|
||||||
|
bereits_verarbeitet=bereits_verarbeitet
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verarbeitete Mails in DB speichern
|
||||||
|
verarbeitete_msg_ids = set()
|
||||||
|
for att in attachments:
|
||||||
|
msg_id = att.get("message_id")
|
||||||
|
if msg_id and msg_id not in verarbeitete_msg_ids:
|
||||||
|
verarbeitete_msg_ids.add(msg_id)
|
||||||
|
db.add(VerarbeiteteMail(
|
||||||
|
postfach_id=id,
|
||||||
|
message_id=msg_id,
|
||||||
|
ordner=att.get("ordner", ""),
|
||||||
|
betreff=att.get("betreff", "")[:500] if att.get("betreff") else None,
|
||||||
|
absender=att.get("absender", "")[:255] if att.get("absender") else None,
|
||||||
|
anzahl_attachments=1
|
||||||
|
))
|
||||||
|
|
||||||
|
postfach.letzter_abruf = datetime.utcnow()
|
||||||
|
postfach.letzte_anzahl = len(attachments)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"ergebnisse": [{
|
||||||
|
"postfach": postfach.name,
|
||||||
|
"anzahl": len(attachments),
|
||||||
|
"dateien": [a["original_name"] for a in attachments],
|
||||||
|
"bereits_verarbeitet": len(bereits_verarbeitet)
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
return {
|
||||||
|
"ergebnisse": [{
|
||||||
|
"postfach": postfach.name,
|
||||||
|
"fehler": str(e)
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
fetcher.disconnect()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/postfaecher/abrufen-alle")
|
||||||
|
def rufe_alle_postfaecher_ab(db: Session = Depends(get_db)):
|
||||||
|
postfaecher = db.query(Postfach).filter(Postfach.aktiv == True).all()
|
||||||
|
ergebnisse = []
|
||||||
|
|
||||||
|
for postfach in postfaecher:
|
||||||
|
ziel = Path(postfach.ziel_ordner)
|
||||||
|
ziel.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
fetcher = MailFetcher({
|
||||||
|
"imap_server": postfach.imap_server,
|
||||||
|
"imap_port": postfach.imap_port,
|
||||||
|
"email": postfach.email,
|
||||||
|
"passwort": postfach.passwort,
|
||||||
|
"ordner": postfach.ordner,
|
||||||
|
"erlaubte_typen": postfach.erlaubte_typen,
|
||||||
|
"max_groesse_mb": postfach.max_groesse_mb
|
||||||
|
})
|
||||||
|
|
||||||
|
try:
|
||||||
|
attachments = fetcher.fetch_attachments(ziel)
|
||||||
|
postfach.letzter_abruf = datetime.utcnow()
|
||||||
|
postfach.letzte_anzahl = len(attachments)
|
||||||
|
|
||||||
|
ergebnisse.append({
|
||||||
|
"postfach": postfach.name,
|
||||||
|
"anzahl": len(attachments),
|
||||||
|
"dateien": [a["original_name"] for a in attachments]
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
ergebnisse.append({
|
||||||
|
"postfach": postfach.name,
|
||||||
|
"fehler": str(e)
|
||||||
|
})
|
||||||
|
finally:
|
||||||
|
fetcher.disconnect()
|
||||||
|
|
||||||
|
db.commit()
|
||||||
|
return {"ergebnisse": ergebnisse}
|
||||||
|
|
||||||
|
|
||||||
|
# ============ Area 2: Source folders ============

@router.get("/ordner", response_model=List[OrdnerResponse])
def liste_ordner(db: Session = Depends(get_db)):
    return db.query(QuellOrdner).all()


@router.post("/ordner", response_model=OrdnerResponse)
def erstelle_ordner(data: OrdnerCreate, db: Session = Depends(get_db)):
    ordner = QuellOrdner(**data.dict())
    db.add(ordner)
    db.commit()
    db.refresh(ordner)
    return ordner


@router.delete("/ordner/{id}")
def loesche_ordner(id: int, db: Session = Depends(get_db)):
    ordner = db.query(QuellOrdner).filter(QuellOrdner.id == id).first()
    if not ordner:
        raise HTTPException(status_code=404, detail="Nicht gefunden")
    db.delete(ordner)
    db.commit()
    return {"message": "Gelöscht"}


@router.get("/ordner/{id}/scannen")
def scanne_ordner(id: int, db: Session = Depends(get_db)):
    ordner = db.query(QuellOrdner).filter(QuellOrdner.id == id).first()
    if not ordner:
        raise HTTPException(status_code=404, detail="Nicht gefunden")

    pfad = Path(ordner.pfad)
    if not pfad.exists():
        return {"anzahl": 0, "fehler": "Ordner existiert nicht"}

    # Collect files (recursively or not)
    dateien = []
    pattern = "**/*" if ordner.rekursiv else "*"
    for f in pfad.glob(pattern):
        if f.is_file() and f.suffix.lower() in [t.lower() for t in ordner.dateitypen]:
            dateien.append(f)

    return {"anzahl": len(dateien), "dateien": [str(f.relative_to(pfad)) for f in dateien[:30]]}
# ============ Rules ============

@router.get("/regeln", response_model=List[RegelResponse])
def liste_regeln(db: Session = Depends(get_db)):
    return db.query(SortierRegel).order_by(SortierRegel.prioritaet).all()


@router.post("/regeln", response_model=RegelResponse)
def erstelle_regel(data: RegelCreate, db: Session = Depends(get_db)):
    regel = SortierRegel(**data.dict())
    db.add(regel)
    db.commit()
    db.refresh(regel)
    return regel


@router.put("/regeln/{id}", response_model=RegelResponse)
def aktualisiere_regel(id: int, data: RegelCreate, db: Session = Depends(get_db)):
    regel = db.query(SortierRegel).filter(SortierRegel.id == id).first()
    if not regel:
        raise HTTPException(status_code=404, detail="Nicht gefunden")
    for key, value in data.dict().items():
        setattr(regel, key, value)
    db.commit()
    db.refresh(regel)
    return regel


@router.delete("/regeln/{id}")
def loesche_regel(id: int, db: Session = Depends(get_db)):
    regel = db.query(SortierRegel).filter(SortierRegel.id == id).first()
    if not regel:
        raise HTTPException(status_code=404, detail="Nicht gefunden")
    db.delete(regel)
    db.commit()
    return {"message": "Gelöscht"}


@router.post("/regeln/test")
def teste_regel(data: RegelTestRequest):
    regel = data.regel
    regel["aktiv"] = True
    regel["prioritaet"] = 1

    sorter = Sorter([regel])
    doc_info = {"text": data.text, "original_name": "test.pdf", "absender": ""}

    passend = sorter.finde_passende_regel(doc_info)

    if passend:
        extrahiert = sorter.extrahiere_felder(passend, doc_info)
        dateiname = sorter.generiere_dateinamen(passend, extrahiert)
        return {"passt": True, "extrahiert": extrahiert, "dateiname": dateiname}

    return {"passt": False}
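# Example request against /api/regeln/test (all values are illustrative):
#   POST /api/regeln/test
#   {"regel": {"name": "Test", "muster": {"keywords": "rechnung"},
#              "schema": "{datum} - Rechnung.pdf"},
#    "text": "Rechnung Nr. 4711 vom 01.02.2026"}
# A match answers {"passt": true, "extrahiert": {...}, "dateiname": "..."}.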
# ============ Sorting ============

def sammle_dateien(ordner: QuellOrdner) -> list:
    """Collects all files from a folder (recursively or not)"""
    pfad = Path(ordner.pfad)
    if not pfad.exists():
        return []

    dateien = []
    pattern = "**/*" if ordner.rekursiv else "*"
    erlaubte = [t.lower() for t in (ordner.dateitypen or [".pdf"])]

    for f in pfad.glob(pattern):
        if f.is_file() and f.suffix.lower() in erlaubte:
            dateien.append(f)

    return dateien


@router.post("/sortierung/starten")
def starte_sortierung(db: Session = Depends(get_db)):
    ordner_liste = db.query(QuellOrdner).filter(QuellOrdner.aktiv == True).all()
    regeln = db.query(SortierRegel).filter(SortierRegel.aktiv == True).order_by(SortierRegel.prioritaet).all()

    if not ordner_liste:
        return {"fehler": "Keine Quell-Ordner konfiguriert", "verarbeitet": []}
    if not regeln:
        return {"fehler": "Keine Regeln definiert", "verarbeitet": []}

    # Convert the rules into dict format
    regeln_dicts = []
    for r in regeln:
        regeln_dicts.append({
            "id": r.id,
            "name": r.name,
            "prioritaet": r.prioritaet,
            "muster": r.muster,
            "extraktion": r.extraktion,
            "schema": r.schema,
            "unterordner": r.unterordner
        })

    sorter = Sorter(regeln_dicts)
    pdf_processor = PDFProcessor()

    ergebnis = {
        "gesamt": 0,
        "sortiert": 0,
        "zugferd": 0,
        "fehler": 0,
        "verarbeitet": []
    }

    for quell_ordner in ordner_liste:
        pfad = Path(quell_ordner.pfad)
        if not pfad.exists():
            continue

        ziel_basis = Path(quell_ordner.ziel_ordner)
        dateien = sammle_dateien(quell_ordner)

        for datei in dateien:
            ergebnis["gesamt"] += 1
            # Relative path for display
            try:
                rel_pfad = str(datei.relative_to(pfad))
            except ValueError:
                rel_pfad = datei.name
            datei_info = {"original": rel_pfad}

            try:
                ist_pdf = datei.suffix.lower() == ".pdf"
                text = ""
                ist_zugferd = False
                ocr_gemacht = False

                # Only PDFs go through the PDF processor
                if ist_pdf:
                    pdf_result = pdf_processor.verarbeite(str(datei))

                    if pdf_result.get("fehler"):
                        raise Exception(pdf_result["fehler"])

                    text = pdf_result.get("text", "")
                    ist_zugferd = pdf_result.get("ist_zugferd", False)
                    ocr_gemacht = pdf_result.get("ocr_durchgefuehrt", False)

                # Handle ZUGFeRD separately
                if ist_zugferd:
                    zugferd_ziel = ziel_basis / "zugferd"
                    zugferd_ziel.mkdir(parents=True, exist_ok=True)

                    neuer_pfad = zugferd_ziel / datei.name
                    counter = 1
                    while neuer_pfad.exists():
                        neuer_pfad = zugferd_ziel / f"{datei.stem}_{counter}{datei.suffix}"
                        counter += 1

                    datei.rename(neuer_pfad)

                    ergebnis["zugferd"] += 1
                    datei_info["zugferd"] = True
                    datei_info["neuer_name"] = neuer_pfad.name

                    db.add(VerarbeiteteDatei(
                        original_pfad=str(datei),
                        original_name=datei.name,
                        neuer_pfad=str(neuer_pfad),
                        neuer_name=neuer_pfad.name,
                        ist_zugferd=True,
                        status="zugferd"
                    ))
                    ergebnis["verarbeitet"].append(datei_info)
                    continue

                # Find a rule (for PDFs via the text, for everything else via the file name)
                doc_info = {
                    "text": text,
                    "original_name": datei.name,
                    "absender": "",
                    "dateityp": datei.suffix.lower()
                }

                regel = sorter.finde_passende_regel(doc_info)

                if not regel:
                    datei_info["fehler"] = "Keine passende Regel"
                    ergebnis["fehler"] += 1
                    ergebnis["verarbeitet"].append(datei_info)
                    continue

                # Extract fields
                extrahiert = sorter.extrahiere_felder(regel, doc_info)

                # Keep the original file extension
                schema = regel.get("schema", "{datum} - Dokument.pdf")
                # Strip the extension from the schema and append the original one
                if schema.endswith(".pdf"):
                    schema = schema[:-4] + datei.suffix
                # The adjusted schema must come last so it overrides the rule's own schema
                neuer_name = sorter.generiere_dateinamen({**regel, "schema": schema}, extrahiert)

                # Target folder
                ziel = ziel_basis
                if regel.get("unterordner"):
                    ziel = ziel / regel["unterordner"]
                ziel.mkdir(parents=True, exist_ok=True)

                # Move
                neuer_pfad = sorter.verschiebe_datei(str(datei), str(ziel), neuer_name)

                ergebnis["sortiert"] += 1
                datei_info["neuer_name"] = neuer_name

                db.add(VerarbeiteteDatei(
                    original_pfad=str(datei),
                    original_name=datei.name,
                    neuer_pfad=neuer_pfad,
                    neuer_name=neuer_name,
                    ist_zugferd=False,
                    ocr_durchgefuehrt=ocr_gemacht,
                    status="sortiert",
                    extrahierte_daten=extrahiert
                ))

            except Exception as e:
                ergebnis["fehler"] += 1
                datei_info["fehler"] = str(e)

            ergebnis["verarbeitet"].append(datei_info)

    db.commit()
    return ergebnis


@router.get("/health")
def health():
    return {"status": "ok"}
# ============ Simple rules (UI-friendly) ============

@router.get("/dokumenttypen")
def liste_dokumenttypen():
    """Returns all available document types for the UI"""
    from ..modules.sorter import DOKUMENTTYPEN
    return [
        {"id": key, "name": config["name"], "schema": config["schema"], "unterordner": config["unterordner"]}
        for key, config in DOKUMENTTYPEN.items()
    ]


class EinfacheRegelCreate(BaseModel):
    name: str
    dokumenttyp: str  # e.g. "rechnung", "vertrag"
    keywords: str  # comma-separated
    firma: Optional[str] = None  # fixed company value
    unterordner: Optional[str] = None
    prioritaet: int = 50


@router.post("/regeln/einfach")
def erstelle_einfache_regel_api(data: EinfacheRegelCreate, db: Session = Depends(get_db)):
    """Creates a rule based on a document type - for the simple UI"""
    from ..modules.sorter import DOKUMENTTYPEN

    typ_config = DOKUMENTTYPEN.get(data.dokumenttyp, DOKUMENTTYPEN["sonstiges"])

    # Pattern as a dict (keywords are parsed by the sorter)
    muster = {"keywords": data.keywords}

    # Extraction (only the company, if given)
    extraktion = {}
    if data.firma:
        extraktion["firma"] = {"wert": data.firma}

    regel = SortierRegel(
        name=data.name,
        prioritaet=data.prioritaet,
        aktiv=True,
        muster=muster,
        extraktion=extraktion,
        schema=typ_config["schema"],
        unterordner=data.unterordner or typ_config["unterordner"]
    )

    db.add(regel)
    db.commit()
    db.refresh(regel)

    return {
        "id": regel.id,
        "name": regel.name,
        "dokumenttyp": data.dokumenttyp,
        "keywords": data.keywords,
        "schema": regel.schema
    }


class ExtraktionTestRequest(BaseModel):
    text: str
    dateiname: Optional[str] = "test.pdf"


@router.post("/extraktion/test")
def teste_extraktion(data: ExtraktionTestRequest):
    """Tests the automatic extraction on a text"""
    from ..modules.extraktoren import extrahiere_alle_felder, baue_dateiname

    dokument_info = {
        "original_name": data.dateiname,
        "absender": ""
    }

    # Extract fields
    felder = extrahiere_alle_felder(data.text, dokument_info)

    # Generate example file names for the various types
    beispiele = {}
    from ..modules.sorter import DOKUMENTTYPEN
    for typ_id, typ_config in DOKUMENTTYPEN.items():
        beispiele[typ_id] = baue_dateiname(typ_config["schema"], felder, ".pdf")

    return {
        "extrahiert": felder,
        "beispiel_dateinamen": beispiele
    }


@router.post("/regeln/{id}/vorschau")
def regel_vorschau(id: int, data: ExtraktionTestRequest, db: Session = Depends(get_db)):
    """Shows a preview of how a rule would be applied to a text"""
    regel = db.query(SortierRegel).filter(SortierRegel.id == id).first()
    if not regel:
        raise HTTPException(status_code=404, detail="Regel nicht gefunden")

    from ..modules.sorter import Sorter

    sorter = Sorter([{
        "id": regel.id,
        "name": regel.name,
        "prioritaet": regel.prioritaet,
        "aktiv": True,
        "muster": regel.muster,
        "extraktion": regel.extraktion,
        "schema": regel.schema,
        "unterordner": regel.unterordner
    }])

    dokument_info = {
        "text": data.text,
        "original_name": data.dateiname or "test.pdf",
        "absender": ""
    }

    # Check whether the rule matches
    passende_regel = sorter.finde_passende_regel(dokument_info)

    if not passende_regel:
        return {
            "matched": False,
            "grund": "Keywords nicht gefunden"
        }

    # Extract fields
    felder = sorter.extrahiere_felder(passende_regel, dokument_info)

    # Generate the file name
    dateiname = sorter.generiere_dateinamen(passende_regel, felder)

    return {
        "matched": True,
        "extrahiert": felder,
        "dateiname": dateiname,
        "unterordner": passende_regel.get("unterordner")
    }
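The two test endpoints make it easy to probe the extractors before wiring up rules. A quick client sketch (assumes the `requests` package and a dev server on localhost:8000; the sample text is made up):

```python
import requests

# Run the automatic field extraction on a sample text
resp = requests.post(
    "http://localhost:8000/api/extraktion/test",
    json={"text": "Rechnung Nr. RG259525 vom 01.02.2026, Betrag 119,00 EUR"},
)
daten = resp.json()
print(daten["extrahiert"])           # detected fields (datum, nummer, betrag, ...)
print(daten["beispiel_dateinamen"])  # resulting names per document type
```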
backend/app/services/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
# Services

backend/app/services/__pycache__/__init__.cpython-313.pyc (BIN, Binary file not shown.)

backend/app/services/pipeline_service.py (Normal file, 360 lines)
@@ -0,0 +1,360 @@
"""
|
||||||
|
Pipeline Service
|
||||||
|
Orchestriert die gesamte Dokumentenverarbeitung
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from ..models import Pipeline, MailConfig, SortierRegel, Dokument, VerarbeitungsLog
|
||||||
|
from ..modules.mail_fetcher import MailFetcher
|
||||||
|
from ..modules.pdf_processor import PDFProcessor
|
||||||
|
from ..modules.sorter import Sorter
|
||||||
|
from ..config import INBOX_DIR, PROCESSED_DIR, ZUGFERD_DIR
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineService:
|
||||||
|
"""Führt die komplette Pipeline-Verarbeitung durch"""
|
||||||
|
|
||||||
|
def __init__(self, db: Session):
|
||||||
|
self.db = db
|
||||||
|
self.pdf_processor = PDFProcessor()
|
||||||
|
|
||||||
|
def verarbeite_pipeline(self, pipeline_id: int) -> Dict:
|
||||||
|
"""
|
||||||
|
Führt alle Schritte einer Pipeline aus
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mit Statistiken und Ergebnissen
|
||||||
|
"""
|
||||||
|
pipeline = self.db.query(Pipeline).filter(Pipeline.id == pipeline_id).first()
|
||||||
|
if not pipeline:
|
||||||
|
return {"fehler": f"Pipeline {pipeline_id} nicht gefunden"}
|
||||||
|
|
||||||
|
if not pipeline.aktiv:
|
||||||
|
return {"fehler": f"Pipeline {pipeline.name} ist deaktiviert"}
|
||||||
|
|
||||||
|
ergebnis = {
|
||||||
|
"pipeline": pipeline.name,
|
||||||
|
"gestartet": datetime.now().isoformat(),
|
||||||
|
"mails_abgerufen": 0,
|
||||||
|
"attachments": 0,
|
||||||
|
"verarbeitet": 0,
|
||||||
|
"zugferd": 0,
|
||||||
|
"ocr": 0,
|
||||||
|
"sortiert": 0,
|
||||||
|
"fehler": []
|
||||||
|
}
|
||||||
|
|
||||||
|
# 1. Mails abrufen
|
||||||
|
inbox_pfad = INBOX_DIR / f"pipeline_{pipeline_id}"
|
||||||
|
inbox_pfad.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
for mail_config in pipeline.mail_configs:
|
||||||
|
if not mail_config.aktiv:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
attachments = self._rufe_mails_ab(mail_config, inbox_pfad)
|
||||||
|
ergebnis["attachments"] += len(attachments)
|
||||||
|
|
||||||
|
# Dokumente in DB anlegen
|
||||||
|
for att in attachments:
|
||||||
|
dokument = Dokument(
|
||||||
|
pipeline_id=pipeline_id,
|
||||||
|
original_name=att["original_name"],
|
||||||
|
original_pfad=att["pfad"],
|
||||||
|
status="neu",
|
||||||
|
extrahierte_daten={
|
||||||
|
"absender": att.get("absender"),
|
||||||
|
"betreff": att.get("betreff"),
|
||||||
|
"mail_datum": att.get("datum")
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.db.add(dokument)
|
||||||
|
self._log(dokument.id, "mail_abruf", "erfolg", att)
|
||||||
|
|
||||||
|
# Letzten Abruf aktualisieren
|
||||||
|
mail_config.letzter_abruf = datetime.utcnow()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
ergebnis["fehler"].append(f"Mail-Abruf {mail_config.name}: {e}")
|
||||||
|
logger.error(f"Mail-Abruf Fehler: {e}")
|
||||||
|
|
||||||
|
self.db.commit()
|
||||||
|
|
||||||
|
# 2. PDFs verarbeiten
|
||||||
|
neue_dokumente = self.db.query(Dokument).filter(
|
||||||
|
Dokument.pipeline_id == pipeline_id,
|
||||||
|
Dokument.status == "neu"
|
||||||
|
).all()
|
||||||
|
|
||||||
|
for dokument in neue_dokumente:
|
||||||
|
try:
|
||||||
|
self._verarbeite_dokument(dokument, pipeline)
|
||||||
|
ergebnis["verarbeitet"] += 1
|
||||||
|
|
||||||
|
if dokument.ist_zugferd:
|
||||||
|
ergebnis["zugferd"] += 1
|
||||||
|
if dokument.ocr_durchgefuehrt:
|
||||||
|
ergebnis["ocr"] += 1
|
||||||
|
if dokument.status == "sortiert":
|
||||||
|
ergebnis["sortiert"] += 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
dokument.status = "fehler"
|
||||||
|
dokument.fehler_meldung = str(e)
|
||||||
|
ergebnis["fehler"].append(f"Verarbeitung {dokument.original_name}: {e}")
|
||||||
|
logger.error(f"Verarbeitungs-Fehler: {e}")
|
||||||
|
|
||||||
|
self.db.commit()
|
||||||
|
|
||||||
|
ergebnis["beendet"] = datetime.now().isoformat()
|
||||||
|
return ergebnis
|
||||||
|
|
||||||
|
    def _rufe_mails_ab(self, mail_config: MailConfig, ziel_ordner: Path) -> List[Dict]:
        """Fetches mails from one mailbox"""
        config = {
            "imap_server": mail_config.imap_server,
            "imap_port": mail_config.imap_port,
            "email": mail_config.email,
            "passwort": mail_config.passwort,
            "ordner": mail_config.ordner,
            "erlaubte_typen": mail_config.erlaubte_typen,
            "max_groesse_mb": mail_config.max_groesse_mb
        }

        fetcher = MailFetcher(config)
        try:
            attachments = fetcher.fetch_attachments(ziel_ordner)
            return attachments
        finally:
            fetcher.disconnect()

    def _verarbeite_dokument(self, dokument: Dokument, pipeline: Pipeline):
        """Processes a single document"""
        pfad = Path(dokument.original_pfad)

        if not pfad.exists():
            raise FileNotFoundError(f"Datei nicht gefunden: {pfad}")

        # Only process PDFs
        if pfad.suffix.lower() != ".pdf":
            dokument.status = "uebersprungen"
            self._log(dokument.id, "verarbeitung", "uebersprungen", {"grund": "Kein PDF"})
            return

        # Process the PDF
        pdf_ergebnis = self.pdf_processor.verarbeite(str(pfad))

        if "fehler" in pdf_ergebnis:
            raise Exception(pdf_ergebnis["fehler"])

        dokument.ist_zugferd = pdf_ergebnis["ist_zugferd"]
        dokument.hat_text = pdf_ergebnis["hat_text"]
        dokument.ocr_durchgefuehrt = pdf_ergebnis["ocr_durchgefuehrt"]

        # Handle ZUGFeRD separately - do NOT rename!
        if dokument.ist_zugferd:
            self._behandle_zugferd(dokument, pdf_ergebnis)
            return

        # Sort
        self._sortiere_dokument(dokument, pdf_ergebnis, pipeline)

    def _behandle_zugferd(self, dokument: Dokument, pdf_ergebnis: Dict):
        """Handles ZUGFeRD invoices (they are never modified)"""
        # Move into a separate folder
        ziel_dir = ZUGFERD_DIR / datetime.now().strftime("%Y-%m")
        ziel_dir.mkdir(parents=True, exist_ok=True)

        quell_pfad = Path(dokument.original_pfad)
        ziel_pfad = ziel_dir / quell_pfad.name

        # Ensure a unique name
        counter = 1
        while ziel_pfad.exists():
            ziel_pfad = ziel_dir / f"{quell_pfad.stem}_{counter}{quell_pfad.suffix}"
            counter += 1

        # Move (do not rename!)
        import shutil
        shutil.move(str(quell_pfad), str(ziel_pfad))

        dokument.neuer_pfad = str(ziel_pfad)
        dokument.neuer_name = ziel_pfad.name
        dokument.status = "zugferd"
        dokument.extrahierte_daten = {
            **(dokument.extrahierte_daten or {}),
            "zugferd_xml": pdf_ergebnis.get("zugferd_xml")
        }

        self._log(dokument.id, "zugferd", "erfolg", {
            "ziel": str(ziel_pfad),
            "hinweis": "ZUGFeRD wird nicht umbenannt"
        })

    def _sortiere_dokument(self, dokument: Dokument, pdf_ergebnis: Dict, pipeline: Pipeline):
        """Sorts a document according to the rules"""
        # Load the rules
        regeln = self.db.query(SortierRegel).filter(
            SortierRegel.pipeline_id == pipeline.id,
            SortierRegel.aktiv == True
        ).order_by(SortierRegel.prioritaet).all()

        # Convert into the sorter format
        regeln_dicts = []
        for r in regeln:
            regeln_dicts.append({
                "id": r.id,
                "name": r.name,
                "prioritaet": r.prioritaet,
                "aktiv": r.aktiv,
                "muster": r.muster,
                "extraktion": r.extraktion,
                "schema": r.schema,
                "ziel_ordner": r.ziel_ordner
            })

        sorter = Sorter(regeln_dicts)

        # Assemble the document info
        dokument_info = {
            "text": pdf_ergebnis.get("text", ""),
            "original_name": dokument.original_name,
            "absender": (dokument.extrahierte_daten or {}).get("absender", ""),
            "betreff": (dokument.extrahierte_daten or {}).get("betreff", "")
        }

        # Find a matching rule
        regel = sorter.finde_passende_regel(dokument_info)

        if not regel:
            dokument.status = "keine_regel"
            self._log(dokument.id, "sortierung", "keine_regel", {})
            return

        # Extract fields
        extrahiert = sorter.extrahiere_felder(regel, dokument_info)

        # Generate the file name
        neuer_name = sorter.generiere_dateinamen(regel, extrahiert)

        # Determine the target folder
        ziel_ordner = regel.get("ziel_ordner") or str(PROCESSED_DIR / pipeline.name)

        # Move
        neuer_pfad = sorter.verschiebe_datei(
            dokument.original_pfad,
            ziel_ordner,
            neuer_name
        )

        # Update the document
        dokument.neuer_name = neuer_name
        dokument.neuer_pfad = neuer_pfad
        dokument.extrahierte_daten = {
            **(dokument.extrahierte_daten or {}),
            "text_auszug": pdf_ergebnis.get("text", "")[:500],
            **extrahiert
        }
        dokument.regel_id = regel.get("id")
        dokument.status = "sortiert"
        dokument.verarbeitet = datetime.utcnow()

        self._log(dokument.id, "sortierung", "erfolg", {
            "regel": regel.get("name"),
            "neuer_name": neuer_name,
            "ziel": neuer_pfad
        })

    def _log(self, dokument_id: int, schritt: str, status: str, details: Dict):
        """Creates a log entry"""
        log = VerarbeitungsLog(
            dokument_id=dokument_id,
            schritt=schritt,
            status=status,
            details=details
        )
        self.db.add(log)
class PipelineManager:
|
||||||
|
"""Verwaltet Pipelines (CRUD-Operationen)"""
|
||||||
|
|
||||||
|
def __init__(self, db: Session):
|
||||||
|
self.db = db
|
||||||
|
|
||||||
|
def erstelle_pipeline(self, name: str, beschreibung: str = "") -> Pipeline:
|
||||||
|
"""Erstellt neue Pipeline"""
|
||||||
|
pipeline = Pipeline(name=name, beschreibung=beschreibung)
|
||||||
|
self.db.add(pipeline)
|
||||||
|
self.db.commit()
|
||||||
|
self.db.refresh(pipeline)
|
||||||
|
return pipeline
|
||||||
|
|
||||||
|
def hole_alle_pipelines(self) -> List[Pipeline]:
|
||||||
|
"""Gibt alle Pipelines zurück"""
|
||||||
|
return self.db.query(Pipeline).all()
|
||||||
|
|
||||||
|
def hole_pipeline(self, pipeline_id: int) -> Optional[Pipeline]:
|
||||||
|
"""Gibt eine Pipeline zurück"""
|
||||||
|
return self.db.query(Pipeline).filter(Pipeline.id == pipeline_id).first()
|
||||||
|
|
||||||
|
def fuege_mail_config_hinzu(self, pipeline_id: int, config: Dict) -> MailConfig:
|
||||||
|
"""Fügt Mail-Konfiguration zu Pipeline hinzu"""
|
||||||
|
mail_config = MailConfig(
|
||||||
|
pipeline_id=pipeline_id,
|
||||||
|
name=config.get("name", "Unbenannt"),
|
||||||
|
imap_server=config["imap_server"],
|
||||||
|
imap_port=config.get("imap_port", 993),
|
||||||
|
email=config["email"],
|
||||||
|
passwort=config["passwort"],
|
||||||
|
ordner=config.get("ordner", "INBOX"),
|
||||||
|
erlaubte_typen=config.get("erlaubte_typen", [".pdf"]),
|
||||||
|
max_groesse_mb=config.get("max_groesse_mb", 25)
|
||||||
|
)
|
||||||
|
self.db.add(mail_config)
|
||||||
|
self.db.commit()
|
||||||
|
self.db.refresh(mail_config)
|
||||||
|
return mail_config
|
||||||
|
|
||||||
|
def fuege_regel_hinzu(self, pipeline_id: int, regel: Dict) -> SortierRegel:
|
||||||
|
"""Fügt Sortier-Regel zu Pipeline hinzu"""
|
||||||
|
sortier_regel = SortierRegel(
|
||||||
|
pipeline_id=pipeline_id,
|
||||||
|
name=regel["name"],
|
||||||
|
prioritaet=regel.get("prioritaet", 100),
|
||||||
|
muster=regel.get("muster", {}),
|
||||||
|
extraktion=regel.get("extraktion", {}),
|
||||||
|
schema=regel.get("schema", "{datum} - Dokument.pdf"),
|
||||||
|
ziel_ordner=regel.get("ziel_ordner")
|
||||||
|
)
|
||||||
|
self.db.add(sortier_regel)
|
||||||
|
self.db.commit()
|
||||||
|
self.db.refresh(sortier_regel)
|
||||||
|
return sortier_regel
|
||||||
|
|
||||||
|
def teste_regel(self, regel: Dict, text: str) -> Dict:
|
||||||
|
"""Testet eine Regel gegen einen Text"""
|
||||||
|
sorter = Sorter([regel])
|
||||||
|
dokument_info = {"text": text, "original_name": "test.pdf", "absender": ""}
|
||||||
|
|
||||||
|
passend = sorter.finde_passende_regel(dokument_info) is not None
|
||||||
|
|
||||||
|
extrahiert = {}
|
||||||
|
dateiname = ""
|
||||||
|
if passend:
|
||||||
|
extrahiert = sorter.extrahiere_felder(regel, dokument_info)
|
||||||
|
dateiname = sorter.generiere_dateinamen(regel, extrahiert)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"regel_passt": passend,
|
||||||
|
"extrahierte_felder": extrahiert,
|
||||||
|
"vorgeschlagener_name": dateiname
|
||||||
|
}
|
||||||
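For reference, a minimal usage sketch of the `PipelineManager` API above. The import path and the internal shape of the `muster`/`extraktion` dictionaries are assumptions (the rule schema is defined elsewhere in this commit), so treat this as an illustration rather than project code:

```python
# Usage sketch only: the SessionLocal import path and the exact rule schema
# ("muster"/"extraktion" keys) are assumed, not taken from this diff.
from backend.app.database import SessionLocal  # hypothetical import path

db = SessionLocal()
manager = PipelineManager(db)

# Create a pipeline and attach a sorting rule to it.
pipeline = manager.erstelle_pipeline("Firma", "Incoming invoices")
regel_dict = {
    "name": "Rechnungen",
    "prioritaet": 10,
    "muster": {"text_enthaelt": ["Rechnung"]},    # assumed matcher format
    "extraktion": {"rechnungsnummer": r"RG\d+"},  # assumed regex format
    "schema": "{datum} - Rechnung {rechnungsnummer}.pdf",
}
manager.fuege_regel_hinzu(pipeline.id, regel_dict)

# Dry-run the rule against sample text before processing real documents.
ergebnis = manager.teste_regel(regel_dict, "Rechnung RG259525 vom 01.02.2026")
print(ergebnis["regel_passt"], ergebnis["vorgeschlagener_name"])
```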
1
backend/app/utils/__init__.py
Normal file
@@ -0,0 +1 @@
# Utilities
20
backend/requirements.txt
Normal file
@@ -0,0 +1,20 @@
# Web Framework
fastapi==0.109.2
uvicorn[standard]==0.27.1
python-multipart==0.0.9
jinja2==3.1.3

# Database
sqlalchemy==2.0.25
aiosqlite==0.19.0

# PDF Processing
pypdf==4.0.1
pdfplumber==0.10.4

# ZUGFeRD
factur-x==3.0

# Utilities
pydantic==2.6.1
python-dotenv==1.0.1
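Among these, pdfplumber handles plain text extraction for the rule engine. The following is a minimal, illustrative sketch of how a text field such as `pdf_ergebnis["text"]` could be produced; the project's real extractor (including its OCR fallback for scanned documents) lives elsewhere in the backend and is not shown in this part of the diff:

```python
# Illustrative only: extract text from a born-digital PDF with pdfplumber.
# Scanned PDFs yield little or no text here and would need the OCR path.
import pdfplumber

def extrahiere_text(pfad: str) -> str:
    """Concatenate the extracted text of all pages."""
    with pdfplumber.open(pfad) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

if __name__ == "__main__":
    text = extrahiere_text("data/inbox/20260201_201802_Rechnung-RG259525.pdf")
    print(text[:500])
```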
BIN
data/dateiverwaltung.db
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201750_Verkaufsschild.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201751_Verkaufsschild.pdf
Normal file
Binary file not shown.
8931
data/inbox/20260201_201754_AG_6286259.pdf
Normal file
File diff suppressed because it is too large
Load diff
BIN
data/inbox/20260201_201754_ZPlanReport-8APLYFEA84.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201802_Rechnung-RG259525.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201803_11009319383778_20251216_154534793.pdf
Normal file
Binary file not shown.
255
data/inbox/20260201_201810_LA11423.pdf
Normal file
@@ -0,0 +1,255 @@
(Raw PDF source added as text: an Oracle Reports-generated PDF consisting of an info dictionary, an ASCII85/Flate-compressed content stream, Courier/Arial font tables, and the xref/trailer; the 255 lines are not reproduced here.)
BIN
data/inbox/20260201_201812_11009319383778_20251219_001558979.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201813_Wir handeln_Bilanz 2025.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201828_Invoice-CG2CLMAP-0002.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201828_Receipt-2682-7632-6822.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201835_Übersicht der Termine 2026.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201902_AGBs.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201903_AGB.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201915_AGB.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201915_Widerrufsrecht.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201917_AGB.pdf
Normal file
Binary file not shown.
2162
data/inbox/20260201_201919_Rechnung_51584264_24587215.pdf
Normal file
File diff suppressed because it is too large
Load diff
2168
data/inbox/20260201_201924_Rechnung_51598897_24602125.pdf
Normal file
File diff suppressed because it is too large
Load diff
3443
data/inbox/20260201_201929_Rechnung_51603127_24606487.pdf
Normal file
File diff suppressed because it is too large
Load diff
1893
data/inbox/20260201_201931_Rechnung_51606153_24609189.pdf
Normal file
File diff suppressed because it is too large
Load diff
3861
data/inbox/20260201_201931_Rechnung_51606334_24609758.pdf
Normal file
File diff suppressed because it is too large
Load diff
3443
data/inbox/20260201_201931_Rechnung_51606351_24609779.pdf
Normal file
File diff suppressed because it is too large
Load diff
2295
data/inbox/20260201_201934_Rechnung_51610568_24613960.pdf
Normal file
File diff suppressed because it is too large
Load diff
2434
data/inbox/20260201_201934_Rechnung_51610653_24613932.pdf
Normal file
File diff suppressed because it is too large
Load diff
1763
data/inbox/20260201_201935_Rechnung_51610817_24612890.pdf
Normal file
File diff suppressed because it is too large
Load diff
1899
data/inbox/20260201_201936_Rechnung_51616584_24620128.pdf
Normal file
File diff suppressed because it is too large
Load diff
1627
data/inbox/20260201_201937_Rechnung_51617273_24620723.pdf
Normal file
File diff suppressed because it is too large
Load diff
2165
data/inbox/20260201_201939_Rechnung_51628732_24632183.pdf
Normal file
File diff suppressed because it is too large
Load diff
2694
data/inbox/20260201_201941_Rechnung_51635518_24638747.pdf
Normal file
File diff suppressed because it is too large
Load diff
3044
data/inbox/20260201_201946_Rechnung_51646944_24650609.pdf
Normal file
File diff suppressed because it is too large
Load diff
2159
data/inbox/20260201_201947_Rechnung_51648321_24651995.pdf
Normal file
File diff suppressed because it is too large
Load diff
BIN
data/inbox/20260201_201949_agb_doc.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201949_widerruf_doc.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201950_1_VAT Document MCWW3C9.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201950_VAT Document MCWW3C9.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201952_Rechnung_12403.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201953_Rechnung RE-2024-32164.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201957_Rechnung_V-23105.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201958_Rechnung_V-23195.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201959_Rechnung_V-23241.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_201959_Verkaufsschild.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202000_Verkaufsschild.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202006_Verkaufsschild.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202007_AGB.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202007_Datenschutz.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202007_Muster-Widerrufsformular.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202007_Widerrufsrecht.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202008_Rechnung Nr. 101567.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202009_HBA25194891.pdf
Normal file
Binary file not shown.
BIN
data/inbox/20260201_202009_Rechnung Nr. 102901.pdf
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff.