import requests
from pdfminer.high_level import extract_text
from io import BytesIO
from playwright.sync_api import sync_playwright
from markdownify import markdownify as md


def fetch_page(url: str, *, timeout: float = 30.0) -> str:
    """Fetch *url* and return its content as text.

    A HEAD request (following redirects) is issued first to read the
    ``Content-Type`` header. If that header mentions PDF, the document is
    downloaded and its text is extracted with pdfminer. Otherwise the page
    is loaded in headless Chromium through Playwright and the rendered HTML
    is converted to Markdown.

    Args:
        url: Address of the page or PDF document to fetch.
        timeout: Upper bound, in seconds, applied to each HTTP request and
            to the browser navigation.

    Returns:
        The extracted PDF text, or the Markdown conversion of the rendered
        HTML.

    Raises:
        requests.RequestException: If the HEAD probe or the PDF download
            fails or times out, or if the PDF download returns an HTTP
            error status.

    Errors raised by Playwright, pdfminer or markdownify are not caught and
    propagate to the caller.
    """
    # Probe the content type first. Without an explicit timeout, requests
    # waits indefinitely on an unresponsive server.
    # The HEAD status is deliberately not checked: a server that rejects
    # HEAD simply yields no PDF content type and falls through to the
    # browser path below.
    head = requests.head(url, allow_redirects=True, timeout=timeout)
    content_type = head.headers.get("Content-Type", "")

    # PDF document: download it and extract the text.
    if "pdf" in content_type.lower():
        response = requests.get(url, timeout=timeout)
        # Fail early on 4xx/5xx instead of feeding an error page to pdfminer.
        response.raise_for_status()
        return extract_text(BytesIO(response.content))

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # no visible window
        try:
            page = browser.new_page()

            # Wait until network activity settles so that script-driven
            # content is present. Playwright expects milliseconds.
            page.goto(url, wait_until="networkidle", timeout=timeout * 1000)

            # Take the HTML after rendering and convert it to Markdown.
            return md(page.content())
        finally:
            # Close the browser even when navigation or conversion fails.
            browser.close()

if __name__ == "__main__":
    # Manual smoke run: fetch one sample page and print the converted text.
    # Alternative sample URL pointing at a .pdf file:
    #   https://contents.xj-storage.jp/xcontents/AS71239/6501b3e1/5c72/40e5/9bef/5540c3909be1/140120260116535128.pdf
    target = "https://www.yahoo.co.jp"
    print(fetch_page(target))
