# app/api/routes/scraper.py

from fastapi import APIRouter, Depends, HTTPException, Query
from app.models.schemas import ScrapeRequest, ScrapeResponse
from app.services.scraper import WebScraper
from app.services.llm_processor import get_ollama_processor
from app.models.article import Article
from typing import List
import logging
import logging

logger = logging.getLogger(__name__)
router = APIRouter()

def get_llm_processor():
    """Return the shared Ollama LLM processor instance."""
    return get_ollama_processor()

async def get_database():
    """Dependency to get database client."""
    from app.main import app
    return app.state.db.get_client()

@router.post("/scrape", response_model=ScrapeResponse)
async def scrape_webpage(
    request: ScrapeRequest,
    process_with_llm: bool = Query(default=False, description="Process HTML with LLM after scraping"),
    db_client = Depends(get_database)
):
    """Scrape complete HTML source code from the provided URL and optionally process with LLM."""
    try:
        # Initialize scraper
        async with WebScraper() as scraper:
            # Scrape the URL (gets complete HTML)
            result = await scraper.scrape_url(str(request.url))
            
            # If LLM processing is requested and scraping was successful
            if process_with_llm and result["status"] == "success":
                logger.info(f"Processing {request.url} with Ollama Qwen LLM...")
                
                # Get LLM processor and process with LLM
                llm_processor = get_llm_processor()
                llm_result = await llm_processor.extract_content_from_html(
                    result["html_source"], 
                    str(request.url)
                )
                
                # Add LLM results to the main result
                result["llm_extraction"] = llm_result
                result["processed_with_llm"] = True
            else:
                result["processed_with_llm"] = False
            
            # Persist the scrape result (successful or failed) to MongoDB
            article_model = Article(db_client)
            page_id = await article_model.create_html_page(result)
            
            # Return response
            return ScrapeResponse(
                id=page_id,
                url=result["url"],
                status=result["status"],
                html_source=result.get("html_source"),
                processing_time=result.get("processing_time"),
                status_code=result.get("status_code"),
                content_length=result.get("content_length"),
                scraped_at=result.get("scraped_at"),
                processed_with_llm=result.get("processed_with_llm", False),
                llm_extraction=result.get("llm_extraction"),
                error=result.get("error")
            )
            
    except Exception as e:
        logger.error(f"Error in scrape_webpage: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/process-with-llm/{page_id}")
async def process_existing_page_with_llm(
    page_id: str,
    db_client = Depends(get_database)
):
    """Process an already scraped page with Qwen-7B LLM."""
    try:
        # Get the page from database
        article_model = Article(db_client)
        page = await article_model.get_html_page(page_id)
        
        if not page:
            raise HTTPException(status_code=404, detail="Page not found")
        
        if not page.get("html_source"):
            raise HTTPException(status_code=400, detail="No HTML source available for processing")
        
        logger.info(f"Processing page {page_id} with Ollama Qwen LLM...")
        
        # Get LLM processor and process with LLM
        llm_processor = get_llm_processor()
        llm_result = await llm_processor.extract_content_from_html(
            page["html_source"], 
            page["url"]
        )
        
        # Update the page with LLM results
        update_data = {
            "llm_extraction": llm_result,
            "processed_with_llm": True
        }
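        # processed_with_llm=True also excludes this page from /batch-process-llm
        # runs when skip_processed is enabled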
        
        await article_model.update_html_page(page_id, update_data)
        
        return {
            "page_id": page_id,
            "url": page["url"],
            "llm_processing_status": llm_result["status"],
            "extracted_content": llm_result.get("extracted_content")
        }
        
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in process_existing_page_with_llm: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/batch-process-llm")
async def batch_process_with_llm(
    limit: int = Query(default=10, le=50, description="Number of pages to process"),
    skip_processed: bool = Query(default=True, description="Skip already processed pages"),
    db_client = Depends(get_database)
):
    """Process multiple unprocessed pages with LLM in batch."""
    try:
        article_model = Article(db_client)
        
        # Fetch pages not yet processed with the LLM when skip_processed is enabled
        # ($ne: True also matches documents where the flag is missing)
        filter_query = {"processed_with_llm": {"$ne": True}} if skip_processed else {}
        pages = await article_model.get_pages_with_filter(filter_query, limit=limit)
        
        if not pages:
            return {"message": "No pages to process", "processed_count": 0}
        
        logger.info(f"Batch processing {len(pages)} pages with Qwen-7B-Chat LLM...")
        
        # Get LLM processor and process pages
        llm_processor = get_llm_processor()
        results = await llm_processor.process_batch(pages)
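        # Each batch result is expected to carry the page_id it came from;
        # it is used below to update the corresponding stored document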
        
        # Update database with results
        processed_count = 0
        for result in results:
            if result.get("status") == "success":
                update_data = {
                    "llm_extraction": {
                        "status": result["status"],
                        "extracted_content": result.get("extracted_content"),
                        "raw_llm_output": result.get("raw_llm_output")
                    },
                    "processed_with_llm": True
                }
                await article_model.update_html_page(result["page_id"], update_data)
                processed_count += 1
        
        return {
            "message": f"Batch processing completed",
            "total_pages": len(pages),
            "processed_count": processed_count,
            "failed_count": len(pages) - processed_count
        }
        
    except Exception as e:
        logger.error(f"Error in batch_process_with_llm: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/llm/model-info")
async def get_llm_model_info():
    """Get information about the loaded Ollama model."""
    try:
        llm_processor = get_llm_processor()
        model_info = await llm_processor.get_model_info()
        return model_info
    except Exception as e:
        logger.error(f"Error getting model info: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/llm/load-model")
async def load_llm_model():
    """Ensure the Ollama model is loaded."""
    try:
        llm_processor = get_llm_processor()
        success = await llm_processor.ensure_model_loaded()
        if success:
            return {"message": "Ollama model is ready"}
        else:
            return {"message": "Failed to load Ollama model", "status": "error"}
    except Exception as e:
        logger.error(f"Error loading LLM model: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/pages/{page_id}", response_model=ScrapeResponse)
async def get_scraped_page(
    page_id: str,
    include_html: bool = Query(default=False, description="Include full HTML source in response"),
    db_client = Depends(get_database)
):
    """Get a scraped HTML page by ID."""
    try:
        article_model = Article(db_client)
        page = await article_model.get_html_page(page_id)
        
        if not page:
            raise HTTPException(status_code=404, detail="Page not found")
        
        # Optionally exclude HTML source for lighter responses
        html_source = page.get("html_source") if include_html else None
        
        return ScrapeResponse(
            id=page["id"],
            url=page["url"],
            status=page["status"],
            html_source=html_source,
            processing_time=page.get("processing_time"),
            status_code=page.get("status_code"),
            content_length=page.get("content_length"),
            scraped_at=page.get("scraped_at"),
            processed_with_llm=page.get("processed_with_llm", False),
            llm_extraction=page.get("llm_extraction"),
            error=page.get("error")
        )
        
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in get_scraped_page: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/pages", response_model=List[ScrapeResponse])
async def get_all_pages(
    limit: int = Query(default=20, le=100, description="Maximum number of pages to return"),
    skip: int = Query(default=0, ge=0, description="Number of pages to skip"),
    include_html: bool = Query(default=False, description="Include full HTML source in response"),
    llm_processed_only: bool = Query(default=False, description="Return only LLM processed pages"),
    db_client = Depends(get_database)
):
    """Get all scraped pages with pagination and filtering options."""
    try:
        article_model = Article(db_client)
        
        # Build filter
        filter_query = {}
        if llm_processed_only:
            filter_query["processed_with_llm"] = True
        
        pages = await article_model.get_pages_with_filter(
            filter_query, 
            limit=limit, 
            skip=skip
        )
        
        return [
            ScrapeResponse(
                id=page["id"],
                url=page["url"],
                status=page["status"],
                html_source=page.get("html_source") if include_html else None,
                processing_time=page.get("processing_time"),
                status_code=page.get("status_code"),
                content_length=page.get("content_length"),
                scraped_at=page.get("scraped_at"),
                processed_with_llm=page.get("processed_with_llm", False),
                llm_extraction=page.get("llm_extraction"),
                error=page.get("error")
            )
            for page in pages
        ]
        
    except Exception as e:
        logger.error(f"Error in get_all_pages: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/stats")
async def get_scraping_stats(
    db_client = Depends(get_database)
):
    """Get comprehensive scraping and LLM processing statistics."""
    try:
        article_model = Article(db_client)
        
        total_pages = await article_model.get_pages_count()
        llm_processed = await article_model.get_pages_count({"processed_with_llm": True})
        successful_scrapes = await article_model.get_pages_count({"status": "success"})
        failed_scrapes = await article_model.get_pages_count({"status": "error"})
        
        return {
            "total_pages_scraped": total_pages,
            "successful_scrapes": successful_scrapes,
            "failed_scrapes": failed_scrapes,
            "llm_processed_pages": llm_processed,
            "pending_llm_processing": successful_scrapes - llm_processed,
            "llm_processing_rate": (llm_processed / successful_scrapes * 100) if successful_scrapes > 0 else 0,
            "collection_name": "html_pages",
            "model_info": await get_llm_processor().get_model_info()
        }
        
    except Exception as e:
        logger.error(f"Error in get_scraping_stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/health")
async def health_check():
    """Health check endpoint."""
    return {
        "status": "healthy", 
        "service": "html_scraper_with_ollama_qwen",
        "llm_service": "ollama",
    }