"""Bilgi indexleme — chunking + embedding + chunk yazımı.

Celery task içinden çağrılır (HTTP request içinde değil — ARCHITECTURE §8).
"""
import logging
from datetime import UTC, datetime

from fastapi import Depends

from app.models.knowledge import FlovyKnowledgeSource
from app.models.product import Product
from app.repositories.knowledge_repo import KnowledgeRepository
from app.services.rag.embedding_client import EmbeddingClient, get_embedding_client

# Module-level logger, looked up under the fixed name "flovy" (not this
# module's ``__name__``).
logger = logging.getLogger("flovy")


def chunk_text(text: str, max_words: int = 350, overlap: int = 40) -> list[str]:
    """Paragraf bazlı chunking, kelime bazlı fallback + overlap.

    max_words ~ 500 token (Gemini input limiti 2048; chunk 500 token altı).
    """
    text = (text or "").strip()
    if not text:
        return []

    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks: list[str] = []
    buffer: list[str] = []
    count = 0

    for para in paragraphs:
        words = para.split()
        if count + len(words) <= max_words:
            buffer.extend(words)
            count += len(words)
        else:
            if buffer:
                chunks.append(" ".join(buffer))
            # uzun tek paragraf → kelime bazlı böl
            if len(words) > max_words:
                start = 0
                while start < len(words):
                    chunks.append(" ".join(words[start : start + max_words]))
                    start += max_words - overlap
                buffer, count = [], 0
            else:
                buffer, count = list(words), len(words)

    if buffer:
        chunks.append(" ".join(buffer))
    return chunks


class KnowledgeIndexer:
    """Index knowledge sources: chunk their text, embed each chunk, store it.

    Persistence goes through ``repo`` and embeddings come from ``embedding``.
    """

    def __init__(
        self,
        repo: KnowledgeRepository = Depends(),
        embedding: EmbeddingClient = Depends(get_embedding_client),
    ):
        """Store the repository and embedding client used for indexing.

        Both parameters default to FastAPI ``Depends`` markers.
        NOTE(review): the module docstring says indexing runs from Celery
        tasks, so callers there presumably pass both arguments explicitly;
        confirm.
        """
        self.repo = repo
        self.embedding = embedding

    async def _index_source(
        self, source: FlovyKnowledgeSource, text: str, base_meta: dict
    ) -> int:
        """Rebuild all chunks of *source* from *text*.

        Marks the source ``"indexing"``, deletes its existing chunks, then
        embeds and stores every chunk produced by ``chunk_text``. On success
        the source becomes ``"indexed"`` with its chunk count and timestamp.

        Args:
            source: Knowledge source whose chunks are replaced.
            text: Full text to chunk and embed.
            base_meta: Metadata attached to every stored chunk.

        Returns:
            Number of chunks written.

        Raises:
            Exception: Any error from deleting, embedding or storing chunks
                is re-raised after the source has been marked ``"failed"``.
        """
        await self.repo.update_source(source, status="indexing", error_message=None)
        chunks = chunk_text(text)

        try:
            # Deleting the old chunks is part of the guarded section so that a
            # failure here marks the source "failed" instead of leaving it
            # stuck in "indexing".
            await self.repo.delete_chunks_of_source(source.id, source.tenant_id)
            for chunk in chunks:
                vec = await self.embedding.embed(chunk, task="RETRIEVAL_DOCUMENT")
                await self.repo.add_chunk(
                    tenant_id=source.tenant_id,
                    source_id=source.id,
                    source_type=source.source_type,
                    content=chunk,
                    embedding=vec,
                    # NOTE(review): hard-coded model name; presumably has to
                    # match the model the embedding client calls. Confirm.
                    embedding_model="text-embedding-004",
                    # Each chunk gets its own copy so stored rows never share
                    # one mutable dict.
                    meta=dict(base_meta),
                )
        except Exception as e:  # noqa: BLE001
            # Log first, with traceback, so the original error is recorded
            # even if the status update below fails as well.
            logger.exception("indexing failed source=%s: %s", source.id, e)
            # The stored error message is capped at 500 characters.
            await self.repo.update_source(source, status="failed", error_message=str(e)[:500])
            raise

        await self.repo.update_source(
            source,
            status="indexed",
            chunk_count=len(chunks),
            # Current UTC time with tzinfo stripped (naive datetime).
            last_indexed_at=datetime.now(UTC).replace(tzinfo=None),
        )
        return len(chunks)

    async def index_product(self, product: Product) -> int:
        """Index a product's title, description and bullet points.

        Reuses the product's existing knowledge source when the repository
        finds one; otherwise creates a ``"product"`` source in ``"pending"``
        state with the URL ``product:<id>``.

        Returns:
            Number of chunks written for the product.
        """
        text = f"{product.title}\n{product.description or ''}"
        if product.bullet_points:
            text += "\n" + "\n".join(str(b) for b in product.bullet_points)

        source = await self.repo.find_product_source(product.tenant_id, product.id)
        if source is None:
            source = await self.repo.create_source(
                tenant_id=product.tenant_id,
                source_type="product",
                title=product.title,
                url=f"product:{product.id}",
                status="pending",
            )
        return await self._index_source(source, text, {"product_id": product.id})

    async def reindex_source(self, source: FlovyKnowledgeSource) -> int:
        """Re-index *source* from its stored ``content``.

        Missing content is treated as an empty string, which produces zero
        chunks.

        Returns:
            Number of chunks written.
        """
        return await self._index_source(
            source, source.content or "", {"source_id": source.id}
        )