# app/services/scraper.py

import httpx
from typing import Dict, Any, Optional
from app.config import get_settings
import logging
import time
import json
import re
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

class WebScraper:
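    """Async scraper that fetches a page and extracts its main readable text.

    Extraction strategies, in order: JSON-LD structured data, CSS-selector
    heuristics, a simple readability-style paragraph scorer, and finally a
    retry with alternate user agents. Intended to be used as an async context
    manager (see the usage sketch at the bottom of this module).
    """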
    def __init__(self):
        self.settings = get_settings()
        self.session = None
    
    async def __aenter__(self):
        # Enhanced headers to mimic a real browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        
        self.session = httpx.AsyncClient(
            timeout=self.settings.REQUEST_TIMEOUT,
            headers=headers,
            follow_redirects=True
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.aclose()
    
    def extract_structured_data(self, html: str) -> Optional[str]:
        """Extract content from JSON-LD structured data"""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            json_scripts = soup.find_all('script', type='application/ld+json')
            
            for script in json_scripts:
                raw = script.string or script.get_text()
                if not raw or not raw.strip():
                    # Empty or non-text script tag; skip instead of failing the whole scan
                    continue
                try:
                    data = json.loads(raw)
                    if isinstance(data, list):
                        if not data:
                            continue
                        data = data[0]
                    
                    if isinstance(data, dict):
                        # Look for article content
                        article_body = None
                        if data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting']:
                            article_body = data.get('articleBody')
                        elif 'articleBody' in data:
                            article_body = data['articleBody']
                        
                        if isinstance(article_body, str) and len(article_body) > 200:
                            return article_body
                            
                except json.JSONDecodeError:
                    continue
        except Exception as e:
            logger.debug(f"Error extracting structured data: {e}")
        
        return None
    
    def extract_content_advanced(self, html: str) -> str:
        """Advanced content extraction using multiple strategies"""
        soup = BeautifulSoup(html, 'html.parser')
        
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
            element.decompose()
        
        # Remove elements whose class/id suggests ads or navigation. Note that the
        # substring matching below (e.g. [class*="ad"]) is aggressive and may also
        # match unrelated classes such as "gradient", "read", or "download".
        unwanted_selectors = [
            '[class*="ad"]', '[id*="ad"]', '[class*="advertisement"]',
            '[class*="nav"]', '[class*="menu"]', '[class*="sidebar"]',
            '[class*="related"]', '[class*="comment"]', '[class*="social"]',
            '[class*="share"]', '[class*="footer"]', '[class*="header"]'
        ]
        
        for selector in unwanted_selectors:
            for element in soup.select(selector):
                element.decompose()
        
        # Candidate selectors for article content; every selector is tried and the
        # longest extracted text wins
        content_selectors = [
            'article',
            '[role="main"]',
            '.article-content', '.article-body', '.article-text',
            '.post-content', '.post-body', '.post-text',
            '.entry-content', '.entry-body',
            '.content-body', '.main-content',
            '.story-body', '.story-content',
            'main',
            '.content'
        ]
        
        best_content = ""
        best_length = 0
        
        # Try each selector and keep the longest content
        for selector in content_selectors:
            elements = soup.select(selector)
            for element in elements:
                # Remove any remaining unwanted nested elements
                for unwanted in element.find_all(['script', 'style', 'nav', 'aside', 'footer', 'header']):
                    unwanted.decompose()
                
                text = element.get_text(separator=' ', strip=True)
                text = re.sub(r'\s+', ' ', text)
                
                if len(text) > best_length and len(text) > 200:
                    best_content = text
                    best_length = len(text)
        
        # If no good content found with selectors, try paragraph-based extraction
        if not best_content:
            paragraphs = soup.find_all('p')
            if paragraphs:
                # Get all paragraph text
                all_text = []
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if len(text) > 50:  # Only include substantial paragraphs
                        all_text.append(text)
                
                if all_text:
                    best_content = ' '.join(all_text)
        
        return best_content.strip()
    
    def extract_with_simple_readability(self, html: str) -> str:
        """Simple readability-like extraction without external library"""
        soup = BeautifulSoup(html, 'html.parser')
        
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        
        # Score paragraphs by content length and position
        paragraphs = soup.find_all('p')
        scored_paragraphs = []
        
        for i, p in enumerate(paragraphs):
            text = p.get_text(strip=True)
            if len(text) > 25:  # Only consider substantial paragraphs
                score = len(text)
                # Boost paragraphs in the middle of the document (likely body content)
                if len(paragraphs) * 0.1 < i < len(paragraphs) * 0.8:
                    score *= 1.5
                scored_paragraphs.append((score, i, text))
        
        # Take the top-scoring paragraphs, then restore document order so the result reads coherently
        scored_paragraphs.sort(key=lambda item: item[0], reverse=True)
        top_paragraphs = sorted(scored_paragraphs[:10], key=lambda item: item[1])
        
        return ' '.join(text for _, _, text in top_paragraphs)
    
    async def try_different_user_agents(self, url: str) -> Optional[str]:
        """Try different user agents to get complete content"""
        user_agents = [
            # Mobile user agents sometimes get different content
            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1",
            # Different desktop browsers
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0",
        ]
        
        for ua in user_agents:
            try:
                async with httpx.AsyncClient(
                    timeout=self.settings.REQUEST_TIMEOUT,
                    headers={"User-Agent": ua},
                    follow_redirects=True
                ) as client:
                    response = await client.get(url)
                    response.raise_for_status()
                    
                    # Try to extract content
                    content = self.extract_content_advanced(response.text)
                    
                    # If we get substantial content, return it
                    if content and len(content) > 500:
                        return content
                        
            except Exception as e:
                logger.debug(f"Failed with user agent {ua}: {e}")
                continue
        
        return None
    
    async def scrape_url(self, url: str) -> Dict[str, Any]:
        """Scrape a URL and extract its main content using multiple strategies (no external readability library)"""
        if self.session is None:
            raise RuntimeError("WebScraper must be used as an async context manager: async with WebScraper() as scraper")
        
        start_time = time.time()
        
        try:
            # Fetch HTML content
            response = await self.session.get(url)
            response.raise_for_status()
            
            html_content = response.text
            
            # Try different content extraction methods
            extracted_content = ""
            
            # 1. Try structured data extraction (most reliable)
            structured_content = self.extract_structured_data(html_content)
            if structured_content and len(structured_content) > 200:
                extracted_content = structured_content
                logger.info(f"Extracted content using structured data for {url}")
            
            # 2. Try advanced selector-based extraction
            if not extracted_content:
                advanced_content = self.extract_content_advanced(html_content)
                if advanced_content and len(advanced_content) > 200:
                    extracted_content = advanced_content
                    logger.info(f"Extracted content using advanced selectors for {url}")
            
            # 3. Try simple readability-like extraction
            if not extracted_content:
                simple_content = self.extract_with_simple_readability(html_content)
                if simple_content and len(simple_content) > 200:
                    extracted_content = simple_content
                    logger.info(f"Extracted content using simple readability for {url}")
            
            # 4. If content seems truncated or short, try different user agents
            if not extracted_content or len(extracted_content) < 500 or "..." in extracted_content[-100:]:
                logger.info(f"Content seems incomplete for {url}, trying different user agents")
                ua_content = await self.try_different_user_agents(url)
                if ua_content and len(ua_content) > len(extracted_content):
                    extracted_content = ua_content
                    logger.info(f"Got better content with different user agent for {url}")
            
            # Clean up the final content
            if extracted_content:
                extracted_content = re.sub(r'\s+', ' ', extracted_content).strip()
            
            # If we still don't have good content, fall back to original HTML
            final_content = extracted_content if extracted_content else html_content
            
            processing_time = time.time() - start_time
            
            return {
                "url": url,
                "html_source": final_content,  # Contains clean content instead of raw HTML
                "status": "success",
                "processing_time": processing_time,
                "scraped_at": time.time(),
                "status_code": response.status_code,
                "content_length": len(final_content),
                "headers": dict(response.headers)
            }
            
        except httpx.HTTPError as e:
            logger.error(f"HTTP error scraping {url}: {e}")
            return {
                "url": url,
                "html_source": None,
                "status": "error",
                "error": f"HTTP error: {str(e)}",
                "processing_time": time.time() - start_time,
                "scraped_at": time.time()
            }
        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            return {
                "url": url,
                "html_source": None,
                "status": "error", 
                "error": str(e),
                "processing_time": time.time() - start_time,
                "scraped_at": time.time()
            }
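

# Example usage (a minimal sketch; assumes this module is importable as
# app.services.scraper and that app.config.get_settings() exposes REQUEST_TIMEOUT,
# as the constructor expects):
#
#     import asyncio
#     from app.services.scraper import WebScraper
#
#     async def main():
#         async with WebScraper() as scraper:
#             result = await scraper.scrape_url("https://example.com/article")
#             if result["status"] == "success":
#                 print(result["content_length"], result["html_source"][:200])
#             else:
#                 print("Scrape failed:", result["error"])
#
#     asyncio.run(main())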