import requests
from bs4 import BeautifulSoup
import time
import sys

def _extract_patent_text(soup):
    """Collect the text sections of a parsed patent page as output lines.

    Args:
        soup (BeautifulSoup): Parsed HTML of the patent page.

    Returns:
        list[str]: Lines in output order: title, patent number, inventors,
        abstract, description, claims. A section whose element is not found
        in the page is simply omitted.
    """
    lines = []

    # Title
    title_element = soup.find('span', {'itemprop': 'title'})
    if title_element:
        lines.append(f"TITLE: {title_element.text.strip()}\n")

    # Publication number
    patent_num_element = soup.find('dd', {'itemprop': 'publicationNumber'})
    if patent_num_element:
        lines.append(f"Patent Number: {patent_num_element.text.strip()}\n")

    # Inventors, one bullet per matching element
    inventors = soup.find_all('dd', {'itemprop': 'inventor'})
    if inventors:
        lines.append("\nINVENTORS:")
        lines.extend(f"- {inventor.text.strip()}" for inventor in inventors)
        lines.append("")

    # Abstract
    abstract_element = soup.find('div', {'class': 'abstract'})
    if abstract_element:
        lines.append("\nABSTRACT:")
        lines.append(abstract_element.text.strip())
        lines.append("")

    # Description: paragraphs and headings in document order
    description_sections = soup.find_all('section', {'itemprop': 'description'})
    if description_sections:
        lines.append("\nDESCRIPTION:")
        for section in description_sections:
            for para in section.find_all(['p', 'heading']):
                text = para.text.strip()
                if not text:
                    continue
                if para.name == 'heading':
                    # Headings are upper-cased and set off by a blank line.
                    lines.append(f"\n{text.upper()}")
                else:
                    lines.append(text)
            lines.append("")

    # Claims
    claims_section = soup.find('section', {'itemprop': 'claims'})
    if claims_section:
        lines.append("\nCLAIMS:")
        for claim in claims_section.find_all('div', {'class': 'claim'}):
            # A claim div nested inside another claim div is already included
            # in the outer div's text; skip it so it is not written twice.
            if claim.find_parent('div', {'class': 'claim'}) is not None:
                continue
            claim_text = claim.text.strip()
            if claim_text:
                lines.append(claim_text)
                lines.append("")

    return lines


def scrape_patent(patent_url, output_filename='patent_text.txt', timeout=30):
    """
    Scrapes text content from a Google Patents page and saves it to a text file.

    On any failure (HTTP/network error, timeout, or an error while parsing or
    writing the file) an error message is printed and the process exits with
    status 1.

    Args:
        patent_url (str): The Google Patents URL to scrape
        output_filename (str): Name of the output text file (default: patent_text.txt)
        timeout (float | tuple): Seconds to wait for the server before giving
            up; passed straight to ``requests.get`` (default: 30)

    Raises:
        SystemExit: With code 1 if fetching, parsing, or writing fails.
    """

    try:
        # Send GET request to the patent URL
        print(f"Fetching patent from: {patent_url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(patent_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the HTML content and pull out the text sections
        soup = BeautifulSoup(response.content, 'html.parser')
        patent_text = _extract_patent_text(soup)

        # Write to file
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(patent_text))

        print(f"Patent text successfully saved to: {output_filename}")
        print(f"Total lines written: {len(patent_text)}")

    except requests.RequestException as e:
        # Covers connection errors, timeouts, and bad HTTP status codes.
        print(f"Error fetching the patent page: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary for a CLI tool: report and exit non-zero.
        print(f"An error occurred: {e}")
        sys.exit(1)

def main():
    """Interactively prompt for a patent URL and output filename, then scrape.

    Exits with status 1 if no URL is entered, and with status 0 if the user
    declines to continue after the non-Google-Patents URL warning.
    """
    print("=== Google Patents Text Scraper ===\n")

    # Get patent URL from user
    print("Enter the Google Patents URL to scrape:")
    print("Example: https://patents.google.com/patent/US5859326A/en")
    patent_url = input("URL: ").strip()

    # Validate URL
    if not patent_url:
        print("Error: No URL provided. Exiting.")
        sys.exit(1)

    if "patents.google.com/patent/" not in patent_url:
        print("Warning: This doesn't look like a Google Patents URL.")
        # Strip as well as lowercase so an answer like "y " is still accepted.
        proceed = input("Continue anyway? (y/n): ").strip().lower()
        if proceed != 'y':
            print("Exiting.")
            sys.exit(0)

    # Get output filename from user
    print("\nEnter the output filename (press Enter for default: patent_text.txt):")
    output_file = input("Filename: ").strip()

    # Use default if no filename provided
    if not output_file:
        output_file = "patent_text.txt"

    # Add .txt extension if not present; compare case-insensitively so a name
    # such as "NOTES.TXT" is not turned into "NOTES.TXT.txt".
    if not output_file.lower().endswith('.txt'):
        output_file += '.txt'

    print("\nStarting scrape...")
    print(f"URL: {patent_url}")
    print(f"Output file: {output_file}\n")

    # Scrape the patent
    scrape_patent(patent_url, output_file)

    # Add a small delay to be respectful to the server
    # NOTE(review): this runs after the only request, so it merely delays
    # exit; move it between requests if batch scraping is ever added.
    time.sleep(1)


if __name__ == "__main__":
    main()