"""
Web scraping service for collecting articles from various search engines
"""
import asyncio
import hashlib
import logging
import re
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
from urllib.parse import urljoin, urlparse
import aiohttp
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

from app.config.settings import get_settings
from app.models.article import Article, SearchJob, SearchEngine, ArticleStatus
from app.services.kafka_service import KafkaService

logger = logging.getLogger(__name__)


class ScraperService:
    """Main scraping service for collecting articles"""
    
    def __init__(self):
        self.settings = get_settings()
        self.kafka_service = KafkaService()
        self.ua = UserAgent()
        self.session: Optional[aiohttp.ClientSession] = None
    
    async def get_session(self) -> aiohttp.ClientSession:
        """Get or create aiohttp session"""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=self.settings.REQUEST_TIMEOUT)
            self.session = aiohttp.ClientSession(
                timeout=timeout,
                headers={'User-Agent': self.ua.random}
            )
        return self.session
    
    async def close_session(self):
        """Close aiohttp session"""
        if self.session and not self.session.closed:
            await self.session.close()
    
    def get_chrome_driver(self) -> webdriver.Chrome:
        """Get Chrome WebDriver instance"""
        chrome_options = Options()
        
        if self.settings.SELENIUM_HEADLESS:
            chrome_options.add_argument("--headless")
        
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(f"--user-agent={self.ua.random}")
        
        if self.settings.CHROME_DRIVER_PATH:
            # Selenium 4 removed the executable_path argument; wrap the path in a Service
            driver = webdriver.Chrome(
                service=Service(executable_path=self.settings.CHROME_DRIVER_PATH),
                options=chrome_options
            )
        else:
            driver = webdriver.Chrome(options=chrome_options)
        
        driver.set_page_load_timeout(self.settings.SELENIUM_TIMEOUT)
        return driver
    
    async def search_google(self, query: str, max_results: int = 100) -> List[Dict[str, Any]]:
        """Search Google for articles"""
        results = []
        
        try:
            # Google's HTML results page is fetched with requests; the blocking call is
            # run in a worker thread via asyncio.to_thread (Python 3.9+) so the event
            # loop is not stalled.
            search_url = "https://www.google.com/search"
            params = {
                'q': query,
                'num': min(max_results, 100),  # Google limits to 100 per page
                'hl': 'en',
                'gl': 'us'
            }
            
            headers = {
                'User-Agent': self.ua.random,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }
            
            response = await asyncio.to_thread(
                requests.get, search_url,
                params=params, headers=headers, timeout=self.settings.REQUEST_TIMEOUT)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Parse Google search results
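            # (Google's result markup changes frequently and unauthenticated requests
            #  may be served a consent or CAPTCHA page, so an empty list here does not
            #  necessarily indicate a parsing bug.)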
            search_results = soup.find_all('div', class_='g')
            
            for idx, result in enumerate(search_results[:max_results]):
                try:
                    # Extract title and URL
                    title_elem = result.find('h3')
                    link_elem = result.find('a')
                    
                    if title_elem and link_elem:
                        title = title_elem.get_text(strip=True)
                        url = link_elem.get('href')
                        
                        if url and url.startswith('http'):
                            # Extract snippet
                            snippet_elem = result.find('span', class_=['aCOpRe', 'st'])
                            snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
                            
                            results.append({
                                'title': title,
                                'url': url,
                                'snippet': snippet,
                                'search_engine': SearchEngine.GOOGLE.value,
                                'search_rank': idx + 1,
                                'search_query': query
                            })
                
                except Exception as e:
                    logger.warning(f"Error parsing Google search result: {e}")
                    continue
            
            logger.info(f"Found {len(results)} results from Google for query: {query}")
            
        except Exception as e:
            logger.error(f"Error searching Google: {e}")
        
        return results
    
    async def search_bing(self, query: str, max_results: int = 100) -> List[Dict[str, Any]]:
        """Search Bing for articles"""
        results = []
        
        try:
            search_url = "https://www.bing.com/search"
            params = {
                'q': query,
                'count': min(max_results, 50),  # Bing caps the number of results per page
                'mkt': 'en-US'
            }
            
            headers = {
                'User-Agent': self.ua.random,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            }
            
            # Run the blocking requests call in a worker thread, as in search_google above
            response = await asyncio.to_thread(
                requests.get, search_url,
                params=params, headers=headers, timeout=self.settings.REQUEST_TIMEOUT)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Parse Bing search results
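            # (Bing's markup also changes over time; the selectors below may need updating.)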
            search_results = soup.find_all('li', class_='b_algo')
            
            for idx, result in enumerate(search_results[:max_results]):
                try:
                    # Extract title and URL
                    title_elem = result.find('h2')
                    link_elem = title_elem.find('a') if title_elem else None
                    
                    if title_elem and link_elem:
                        title = title_elem.get_text(strip=True)
                        url = link_elem.get('href')
                        
                        if url and url.startswith('http'):
                            # Extract snippet
                            snippet_elem = result.find('p')
                            snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
                            
                            results.append({
                                'title': title,
                                'url': url,
                                'snippet': snippet,
                                'search_engine': SearchEngine.BING.value,
                                'search_rank': idx + 1,
                                'search_query': query
                            })
                
                except Exception as e:
                    logger.warning(f"Error parsing Bing search result: {e}")
                    continue
            
            logger.info(f"Found {len(results)} results from Bing for query: {query}")
            
        except Exception as e:
            logger.error(f"Error searching Bing: {e}")
        
        return results
    
    async def scrape_article_content(self, url: str) -> Dict[str, Any]:
        """Scrape article content from URL"""
        try:
            session = await self.get_session()
            
            async with session.get(url) as response:
                if response.status != 200:
                    logger.warning(f"HTTP {response.status} for URL: {url}")
                    return {}
                
                html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')
                
                # Extract article content
                article_data = {
                    'url': url,
                    'source_domain': urlparse(url).netloc,
                    'scraped_date': datetime.utcnow()
                }
                
                # Try to extract title
                title = None
                for selector in ['h1', 'title', '.article-title', '.post-title']:
                    title_elem = soup.select_one(selector)
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                        break
                
                if title:
                    article_data['title'] = title
                
                # Try to extract content
                content = None
                content_selectors = [
                    'article', '.article-content', '.post-content', 
                    '.entry-content', '.content', 'main', '.main-content'
                ]
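                # The first selector that matches wins; if none match, the code falls
                # back to the stripped <body> text further below.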
                
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        # Remove script and style elements
                        for tag in content_elem.find_all(["script", "style"]):
                            tag.decompose()
                        content = content_elem.get_text(strip=True)
                        break
                
                if not content:
                    # Fallback to body content
                    body = soup.find('body')
                    if body:
                        for tag in body.find_all(["script", "style", "nav", "header", "footer"]):
                            tag.decompose()
                        content = body.get_text(strip=True)
                
                if content and len(content) >= self.settings.ARTICLE_CONTENT_MIN_LENGTH:
                    article_data['content'] = content
                    article_data['word_count'] = len(content.split())
                    article_data['reading_time'] = max(1, article_data['word_count'] // 200)  # Assume 200 WPM
                
                # Try to extract author
                author_selectors = ['.author', '.byline', '[rel="author"]', '.post-author']
                for selector in author_selectors:
                    author_elem = soup.select_one(selector)
                    if author_elem:
                        article_data['author'] = author_elem.get_text(strip=True)
                        break
                
                # Try to extract published date
                date_selectors = ['time', '.date', '.published', '.post-date']
                for selector in date_selectors:
                    date_elem = soup.select_one(selector)
                    if date_elem:
                        date_text = date_elem.get('datetime') or date_elem.get_text(strip=True)
                        # date_text is extracted but not stored yet; parsing it into a
                        # datetime would need to be added here before it can be saved.
                        break
                
                # Extract images
                images = []
                for img in soup.find_all('img', src=True):
                    img_url = urljoin(url, img['src'])
                    if img_url.startswith('http'):
                        images.append(img_url)
                
                if images:
                    article_data['images'] = images[:10]  # Limit to 10 images
                
                # Generate content hash for duplicate detection
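                # (MD5 here is only a dedupe fingerprint, not a security measure.)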
                if content:
                    article_data['content_hash'] = hashlib.md5(content.encode()).hexdigest()
                
                return article_data
                
        except Exception as e:
            logger.error(f"Error scraping article content from {url}: {e}")
            return {}
    
    async def process_search_results(self, search_results: List[Dict[str, Any]], job_id: str) -> List[str]:
        """Process search results and scrape article content"""
        article_ids = []
        
        for result in search_results:
            try:
                # Add delay between requests
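                # (results are scraped sequentially on purpose, trading throughput for
                #  politeness toward the target sites)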
                await asyncio.sleep(self.settings.DELAY_BETWEEN_REQUESTS)
                
                # Scrape article content
                article_data = await self.scrape_article_content(result['url'])
                
                if article_data and article_data.get('content'):
                    # Merge search result data with scraped content
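                    # (dict.update() lets search-result fields such as the title take
                    #  precedence over the freshly scraped values)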
                    article_data.update(result)
                    
                    # Check for duplicates
                    if article_data.get('content_hash'):
                        existing_article = Article.objects(content_hash=article_data['content_hash']).first()
                        if existing_article:
                            article_data['is_duplicate'] = True
                            article_data['duplicate_of'] = str(existing_article.id)
                            
                            # Publish duplicate detected event
                            await self.kafka_service.publish_duplicate_detected(
                                article_data, str(existing_article.id)
                            )
                    
                    # Save article to database
                    article = Article(**article_data)
                    article.status = ArticleStatus.COMPLETED.value
                    article.save()
                    
                    article_ids.append(str(article.id))
                    
                    # Publish article scraped event
                    await self.kafka_service.publish_article_scraped({
                        'id': str(article.id),
                        **article_data
                    })
                    
                    logger.info(f"Successfully scraped article: {article_data.get('title', 'Unknown')}")
                
                else:
                    logger.warning(f"Failed to scrape content from: {result['url']}")
            
            except Exception as e:
                logger.error(f"Error processing search result {result.get('url', 'Unknown')}: {e}")
                continue
        
        return article_ids
    
    async def execute_search_job(self, job: SearchJob) -> bool:
        """Execute a search job"""
        try:
            # Update job status
            job.status = ArticleStatus.PROCESSING.value
            job.started_date = datetime.utcnow()
            job.save()
            
            # Publish job started event
            await self.kafka_service.publish_search_job_started({
                'job_id': job.job_id,
                'search_query': job.search_query,
                'search_engines': job.search_engines,
                'max_results': job.max_results,
                'created_by': job.created_by
            })
            
            all_results = []
            
            # Search each engine
            for engine in job.search_engines:
                try:
                    if engine == SearchEngine.GOOGLE.value:
                        results = await self.search_google(job.search_query, job.max_results)
                    elif engine == SearchEngine.BING.value:
                        results = await self.search_bing(job.search_query, job.max_results)
                    else:
                        logger.warning(f"Unsupported search engine: {engine}")
                        continue
                    
                    all_results.extend(results)
                    
                except Exception as e:
                    logger.error(f"Error searching {engine}: {e}")
                    job.errors.append({
                        'engine': engine,
                        'error': str(e),
                        'timestamp': datetime.utcnow()
                    })
            
            # Remove duplicates based on URL
            unique_results = []
            seen_urls = set()
            
            for result in all_results:
                if result['url'] not in seen_urls:
                    unique_results.append(result)
                    seen_urls.add(result['url'])
            
            job.total_found = len(unique_results)
            
            # Process search results
            article_ids = await self.process_search_results(unique_results, job.job_id)
            
            # Update job with results
            job.total_scraped = len(article_ids)
            job.total_failed = job.total_found - job.total_scraped
            job.articles_ids = article_ids
            job.status = ArticleStatus.COMPLETED.value
            job.completed_date = datetime.utcnow()
            job.save()
            
            # Calculate duration
            duration = (job.completed_date - job.started_date).total_seconds()
            
            # Publish job completed event
            await self.kafka_service.publish_search_job_completed({
                'job_id': job.job_id,
                'search_query': job.search_query,
                'total_found': job.total_found,
                'total_scraped': job.total_scraped,
                'total_failed': job.total_failed,
                'duration': duration,
                'status': job.status
            })
            
            logger.info(f"Search job {job.job_id} completed successfully. Scraped {job.total_scraped}/{job.total_found} articles")
            return True
            
        except Exception as e:
            logger.error(f"Error executing search job {job.job_id}: {e}")
            
            # Update job status to failed
            job.status = ArticleStatus.FAILED.value
            job.errors.append({
                'error': str(e),
                'timestamp': datetime.utcnow()
            })
            job.save()
            
            # Publish job failed event
            await self.kafka_service.publish_search_job_failed({
                'job_id': job.job_id,
                'search_query': job.search_query,
                'total_scraped': job.total_scraped
            }, str(e))
            
            return False
        
        finally:
            await self.close_session()
    
    def __del__(self):
        """Best-effort cleanup when the service is garbage-collected"""
        if hasattr(self, 'session') and self.session and not self.session.closed:
            try:
                # Scheduling the close requires a running event loop
                asyncio.get_running_loop().create_task(self.close_session())
            except RuntimeError:
                pass  # no running loop; the session cannot be closed cleanly here
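

# Minimal usage sketch (an illustration only, not wiring that exists in this module):
# it assumes SearchJob is a mongoengine Document exposing .objects, as Article does
# above, and that a job document has already been created elsewhere.
#
#   async def run(job_id: str) -> None:
#       job = SearchJob.objects(job_id=job_id).first()
#       if job:
#           await ScraperService().execute_search_job(job)
#
#   asyncio.run(run("example-job-id"))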