import requests
from bs4 import BeautifulSoup
import time
import sys

# Seconds to wait on the server before giving up. Without a timeout,
# requests.get() may block indefinitely on an unresponsive host.
_REQUEST_TIMEOUT_SECONDS = 30

# Description tags whose complete text is emitted as one output entry.
_HEADING_TAGS = ('heading', 'h1', 'h2', 'h3', 'h4')
_BLOCK_TAGS = _HEADING_TAGS + ('p',)


def _normalized_text(tag):
    """Return the tag's text with each run of whitespace collapsed to one space.

    Unlike ``get_text(strip=True)``, this keeps the spacing between inline
    children, so ``see <b>FIG. 1</b> below`` does not become ``seeFIG. 1below``.
    """
    return ' '.join(tag.get_text().split())


def _safe_filename_stem(raw):
    """Replace every character that is not alphanumeric, '-', '_' or '.' with '_'.

    The stem comes from scraped page content, so it must not be able to
    smuggle path separators into the output filename.
    """
    return ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in raw)


def _has_block_ancestor(node, root):
    """Return True if a heading/<p> tag sits between ``node`` and ``root``."""
    for ancestor in node.parents:
        if ancestor is root:
            return False
        if ancestor.name in _BLOCK_TAGS:
            return True
    return False


def _description_lines(description_div):
    """Collect headings, paragraphs and loose text nodes of the description.

    Headings are upper-cased and prefixed with a blank line. Any node that
    lives inside a heading or <p> is skipped, because the enclosing tag's
    text already contains it; this is what prevents duplicated output.
    """
    lines = []
    for element in description_div.descendants:
        is_text_node = element.name is None and isinstance(element, str)
        if element.name not in _BLOCK_TAGS and not is_text_node:
            continue
        if _has_block_ancestor(element, description_div):
            continue

        if is_text_node:
            text = ' '.join(element.split())
        else:
            text = _normalized_text(element)

        if element.name in _HEADING_TAGS:
            if text:
                lines.append(f"\n{text.upper()}")
        elif len(text) > 2:  # Skip very short text fragments
            lines.append(text)
    return lines


def _claims_lines(soup):
    """Return the CLAIMS section lines, or an empty list if none is found.

    Prefers ``section[itemprop=claims]`` and numbers its ``div.claim``
    children; falls back to the section's raw text, and finally to a
    ``div.claims`` element when no such section exists.
    """
    claims_section = soup.find('section', {'itemprop': 'claims'})
    if claims_section is None:
        claims_div = soup.find('div', {'class': 'claims'})
        if claims_div is None:
            return []
        lines = ["\nCLAIMS:"]
        claims_text = claims_div.get_text(separator='\n', strip=True)
        if claims_text:
            lines.extend([claims_text, ""])
        return lines

    lines = ["\nCLAIMS:"]
    matched = claims_section.find_all('div', {'class': 'claim'})
    if matched:
        # A div.claim nested inside another matched div.claim would repeat
        # the same text and shift the numbering, so keep outermost ones only.
        matched_ids = {id(claim) for claim in matched}
        outermost = [
            claim for claim in matched
            if not any(id(parent) in matched_ids for parent in claim.parents)
        ]
        for i, claim in enumerate(outermost, 1):
            claim_text = _normalized_text(claim)
            if claim_text:
                lines.extend([f"{i}. {claim_text}", ""])
    else:
        # Fallback: get all text from claims section
        claims_text = claims_section.get_text(separator='\n', strip=True)
        if claims_text:
            lines.extend([claims_text, ""])
    return lines


def scrape_patent(patent_url, output_filename=None):
    """
    Scrapes text content from a Google Patents page and saves it to a text file.

    The output contains, when present on the page: title, patent number,
    inventors, abstract, description and claims, in that order.

    Args:
        patent_url (str): The Google Patents URL to scrape
        output_filename (str): Name of the output text file. When empty or
            None, "<patent number>_patent.txt" is used (with unsafe filename
            characters replaced by "_"), or "patent_text.txt" if the page
            exposes no patent number.

    Returns:
        None. Progress and errors are printed to stdout.

    Note:
        On any failure (network, HTTP status, parsing, file write) the error
        is printed and the process is terminated with ``sys.exit(1)``.
    """
    try:
        # Send GET request to the patent URL
        print(f"Fetching patent from: {patent_url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(
            patent_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the patent number up once; it feeds both filename and output.
        patent_num_element = soup.find('dd', {'itemprop': 'publicationNumber'})
        patent_number = None
        if patent_num_element is not None:
            patent_number = patent_num_element.text.strip()

        # If no output filename specified, use patent number
        if not output_filename:
            if patent_number:
                output_filename = f"{_safe_filename_stem(patent_number)}_patent.txt"
            else:
                output_filename = "patent_text.txt"

        patent_text = []

        # Extract patent title
        title_element = soup.find('span', {'itemprop': 'title'})
        if title_element is not None:
            patent_text.append(f"TITLE: {title_element.text.strip()}\n")

        if patent_number is not None:
            patent_text.append(f"Patent Number: {patent_number}\n")

        # Extract inventors
        inventors = soup.find_all('dd', {'itemprop': 'inventor'})
        if inventors:
            patent_text.append("\nINVENTORS:")
            patent_text.extend(f"- {inventor.text.strip()}" for inventor in inventors)
            patent_text.append("")

        # Extract abstract
        abstract_element = soup.find('div', {'class': 'abstract'})
        if abstract_element is not None:
            patent_text.append("\nABSTRACT:")
            patent_text.append(abstract_element.text.strip())
            patent_text.append("")

        # Extract description sections from div.description
        description_div = soup.find('div', {'class': 'description'})
        if description_div is not None:
            patent_text.append("\nDESCRIPTION:")
            patent_text.extend(_description_lines(description_div))
            patent_text.append("")

        # Alternative method: section tags with itemprop="description". Used
        # when div.description is missing or the output so far is very short.
        non_blank_count = sum(1 for item in patent_text if item.strip())
        if description_div is None or non_blank_count < 10:
            description_sections = soup.find_all('section', {'itemprop': 'description'})
            if description_sections:
                patent_text.append("\nDESCRIPTION (Alternative extraction):")
                for section in description_sections:
                    text_content = section.get_text(separator='\n', strip=True)
                    for line in text_content.split('\n'):
                        line = line.strip()
                        if len(line) > 2:
                            patent_text.append(line)
                patent_text.append("")

        # Extract claims - tries several page layouts
        patent_text.extend(_claims_lines(soup))

        # Write to file
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(patent_text))

        print(f"Patent text successfully saved to: {output_filename}")
        print(f"Total lines written: {len(patent_text)}")

    except requests.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        print(f"Error fetching the patent page: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything else (parsing, file I/O) and exit.
        print(f"An error occurred: {e}")
        sys.exit(1)

# Example usage
if __name__ == "__main__":
    # Interactive driver: prompt for a URL and an optional output filename,
    # then run a single scrape.
    print("=== Google Patents Text Scraper ===\n")

    # Get patent URL from user
    print("Enter the Google Patents URL to scrape:")
    print("Example: https://patents.google.com/patent/US5859326A/en")
    patent_url = input("URL: ").strip()

    # Validate URL
    if not patent_url:
        print("Error: No URL provided. Exiting.")
        sys.exit(1)

    if "patents.google.com/patent/" not in patent_url:
        print("Warning: This doesn't look like a Google Patents URL.")
        # Strip as well as lower-case, so an answer such as "y " or " Y" is
        # accepted, consistent with how the other prompts are read.
        proceed = input("Continue anyway? (y/n): ").strip().lower()
        if proceed != 'y':
            print("Exiting.")
            sys.exit(0)

    # Get output filename from user; None lets scrape_patent pick a name
    # based on the patent number.
    print("\nEnter the output filename (press Enter to use patent number as filename):")
    output_file = input("Filename: ").strip() or None

    # Add .txt extension if not present (case-insensitive, so "notes.TXT"
    # does not become "notes.TXT.txt").
    if output_file and not output_file.lower().endswith('.txt'):
        output_file += '.txt'

    print("\nStarting scrape...")
    if output_file:
        print(f"Output file: {output_file}")
    else:
        print("Output file: Will use patent number as filename")
    print(f"URL: {patent_url}\n")

    # Scrape the patent
    scrape_patent(patent_url, output_file)

    # Brief pause before exiting. NOTE(review): only one request is made per
    # run, so this only throttles anything if the script is invoked
    # repeatedly (e.g. from a shell loop) — confirm it is still wanted.
    time.sleep(1)