Create a searchable database from issues of a NED periodical¶
Trove contains thousands of publications submitted through the National eDeposit Service (NED). When I last checked, this included 8,572 periodicals comprising a total of 179,510 issues! Amongst the periodicals are many local and community newsletters that provide a valuable record of everyday life – often filling the gap left by the demise of local newspapers. Some of these periodicals have access constraints, but most can be viewed online in Trove. However, unlike Trove's own digitised periodicals or newspapers, the contents of these publications don't appear in Trove search results. If the NED publications are born-digital PDFs containing a text layer, the content of each issue can be searched individually using the built-in PDF viewer. But there's no way of searching for terms across every issue of a NED periodical in Trove.
This greatly limits the potential research uses of NED periodicals. This notebook helps open up the contents of these periodicals by creating a workflow with the following steps:
- download the PDFs of every issue in a NED periodical
- extract the text for each page in the PDFs
- create an SQLite database containing metadata and text for each page
- build a full-text index on the text content to allow easy searching
- explore the database using Datasette
# Let's import the libraries we need.
import json
import re
import shutil
import time
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import arrow
import fitz
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from jupyter_server import serverapp
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
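# Create a requests session that automatically retries requests after server errors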
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
Harvest title and issue metadata¶
The first step is to harvest information about all the issues of a periodical. The Trove API doesn't provide this data, so you have to scrape it from the web interface. There are two stages:
- scrape a list of issues from the periodical's 'Browse this collection' pages
- supplement the metadata for each individual issue by scraping data from the Trove digital resource viewer
These methods are documented in the Trove Data Guide. See:
- Get a list of items from a digitised collection
- Extract additional metadata from the digitised resource viewer
To get started you need the nla.obj identifier of the publication. This is in the url of the periodical's 'collection' page in the Trove digital resource viewer. For example, here's the collection page of The Triangle. The url of this page is https://nla.gov.au/nla.obj-3121636851, so the identifier is nla.obj-3121636851.
You can set the identifier in the cell below.
title_id = "nla.obj-3121636851"
def get_metadata(id):
"""
Extract work data in a JSON string from the work's HTML page.
"""
if not id.startswith("http"):
id = "https://nla.gov.au/" + id
response = s.get(id)
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
return json.loads(work_data)
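If you're curious about what this embedded metadata looks like, you can call get_metadata() on the collection identifier itself – this is just an optional check, and the available keys vary from record to record.
# Optional: peek at the metadata embedded in the viewer page for the collection record
# (the exact keys vary between records)
metadata = get_metadata(title_id)
print(sorted(metadata.keys()))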
def get_iso_date(date):
if date:
iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY")
else:
# So we can use this field in facets
iso_date = "0"
return iso_date
def get_issues(parent_id):
"""
Get the ids of issues that are children of the current record.
"""
start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
# The initial startIdx value
start = 0
# Number of results per page
n = 20
parts = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
# Get the browse page
response = s.get(start_url.format(parent_id, start))
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "lxml")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
title = detail.find("h3")
if title:
issue_id = title.parent["href"].strip("/")
else:
issue_id = detail.find("a")["href"].strip("/")
# Get the issue id
parts.append(issue_id)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return parts
def harvest_issues(title_id):
data_dir = Path("downloads", title_id)
data_dir.mkdir(exist_ok=True, parents=True)
issues = get_issues(title_id)
with Path(data_dir, f"{title_id}-issues.ndjson").open("w") as ndjson_file:
for issue_id in issues:
metadata = get_metadata(issue_id)
try:
issue = {
"id": metadata["pid"],
"title_id": title_id,
"title": metadata["title"],
"description": metadata.get("subUnitNo", ""),
"date": get_iso_date(metadata.get("issueDate", None)),
"url": f"https://nla.gov.au/{metadata['pid']}",
"ebook_type": metadata.get("ebookType", ""),
"access_conditions": metadata.get("accessConditions", ""),
"copyright_policy": metadata.get("copyrightPolicy", ""),
}
            except KeyError:
                # Flag issues that are missing required metadata
                print(issue_id)
else:
ndjson_file.write(f"{json.dumps(issue)}\n")
harvest_issues(title_id)
The harvesting process creates an ndjson
data file in the downloads/[title id]
directory. This file includes details of every issue.
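If you want a quick look at the harvested data, you can load the ndjson file into a dataframe – an optional check that uses the same pd.read_json() call as the download step below.
# Optional: load the harvested issue metadata into a dataframe for a quick look
issues_df = pd.read_json(Path("downloads", title_id, f"{title_id}-issues.ndjson"), lines=True)
print(issues_df.shape[0], "issues harvested")
issues_df.head()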
Download the PDFs¶
The metadata harvested above includes the nla.obj
identifiers for every issue. You can use these identifiers to download the PDFs.
The download method is documented in the Trove Data Guide's HOW TO Get text, images, and PDFs using Trove’s download link. However, the parameters used for NED publications are a bit different to those used with digitised resources. Instead of pdf, you need to set the downloadOption parameter to eBook. The firstPage and lastPage parameters are both set to -1. Using these settings you can download a zip file that contains the original PDF file.
The PDFs are saved in the downloads/[title id]/pdfs
directory.
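If you want to see what the download endpoint returns before harvesting everything, you can fetch a single issue and list the contents of the zip file – a minimal sketch, assuming the issue metadata has been harvested above. The download code below expects each zip to contain a single PDF named [issue id]-1.pdf.
# Optional: inspect the zip file returned for a single issue
issues_df = pd.read_json(Path("downloads", title_id, f"{title_id}-issues.ndjson"), lines=True)
sample_id = issues_df["id"].iloc[0]
sample_url = f"https://nla.gov.au/{sample_id}/download?downloadOption=eBook&firstPage=-1&lastPage=-1"
response = s.get(sample_url)
print(ZipFile(BytesIO(response.content)).namelist())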
def download_pdfs(title_id):
data_dir = Path("downloads", title_id)
pdf_dir = Path(data_dir, "pdfs")
pdf_dir.mkdir(exist_ok=True, parents=True)
for id in pd.read_json(Path(data_dir, f"{title_id}-issues.ndjson"), lines=True)[
"id"
].to_list():
pdf_file = f"{id}-1.pdf"
pdf_path = Path(pdf_dir, pdf_file)
if not pdf_path.exists():
# This url downloads a zip file containing the PDF,
# note that downloadOption needs to be set to eBook
download_url = f"https://nla.gov.au/{id}/download?downloadOption=eBook&firstPage=-1&lastPage=-1"
response = s.get(download_url)
zipped = ZipFile(BytesIO(response.content))
zipped.extract(pdf_file, path=pdf_dir)
time.sleep(2)
download_pdfs(title_id)
Extract text from the PDFs¶
The next step is to extract the text from each page of the downloaded PDFs using PyMuPDF (fitz). Line breaks and redundant whitespace are removed, and the text of each page is saved as a separate file in the downloads/[title id]/text directory.
def clean_text(text):
"""
Remove linebreaks and extra whitespace from text.
"""
text = text.replace("\n", " ")
text = re.sub(r"\s+", " ", text)
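    # Return UTF-8 encoded bytes so the text can be saved with Path.write_bytes()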
return text.encode()
def extract_text(title_id):
"""
Extract text from each page of the PDFs and save as separate text files.
"""
data_dir = Path("downloads", title_id)
pdf_dir = Path(data_dir, "pdfs")
text_dir = Path(data_dir, "text")
text_dir.mkdir(exist_ok=True, parents=True)
for pdf_file in Path(pdf_dir).glob("*.pdf"):
pid = pdf_file.stem[:-2]
doc = fitz.open(pdf_file)
for i, page in enumerate(doc):
text_path = Path(text_dir, f"{pid}-p{i+1}.txt")
text = page.get_text()
Path(text_path).write_bytes(clean_text(text))
extract_text(title_id)
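As a quick check, you can count the number of page-level text files that were created.
# Optional: count the page-level text files created by extract_text()
text_dir = Path("downloads", title_id, "text")
print(len(list(text_dir.glob("*.txt"))), "pages of text extracted")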
Load text and metadata into a SQLite database¶
Finally, you can use sqlite-utils to create a SQLite database and load the metadata and text from each page. The code below also creates a metadata.json
file that can be used to configure Datasette to help you explore the data.
The database and metadata.json
files are saved in the downloads/[title id]/datasette
directory.
def save_metadata(title_id, df, db_dir):
"""
    Create a metadata.json file for use with Datasette.
"""
title = df["title"].iloc[0]
description = f"{df.shape[0]} issues from {df['date'].min()} to {df['date'].max()}"
sql = "SELECT issue_id, issue_details, issue_date, page_number, snippet(pages_fts, -1, '<span class=\"has-text-warning\">', '</span>', '...', 50) AS snippet, bm25(pages_fts) AS rank FROM pages JOIN pages_fts ON pages.rowid=pages_fts.rowid WHERE pages_fts match :query AND issue_date >= :start_date AND issue_date <= :end_date ORDER BY rank"
metadata = {
"title": "Search NED periodicals",
"databases": {
slugify(title): {
"title": title.strip("."),
"description": description,
"source_url": f"https://nla.gov.au/{title_id}",
"queries": {
"search": {
"sql": sql,
"hide_sql": True,
"searchmode": "raw",
"title": f"Search {title.strip('.')}",
"description": description,
"source_url": f"https://nla.gov.au/{title_id}",
}
},
}
},
}
Path(db_dir, "metadata.json").write_text(json.dumps(metadata, indent=4))
return title
def create_db(title_id):
data_dir = Path("downloads", title_id)
db_dir = Path(data_dir, "datasette")
db_dir.mkdir(exist_ok=True)
df = pd.read_json(Path(data_dir, f"{title_id}-issues.ndjson"), lines=True)
title = save_metadata(title_id, df, db_dir)
text_dir = Path(data_dir, "text")
db = Database(Path(db_dir, f"{slugify(title)}.db"), recreate=True)
for row in df.itertuples():
for page in sorted(Path(text_dir).glob(f"{row.id}-p*.txt")):
page_num = re.search(r"-p(\d+).txt", page.name).group(1)
data = {
"issue_id": row.id,
"issue_details": row.description,
"issue_date": row.date,
"page_number": page_num,
"text": page.read_text(),
}
db["pages"].insert(data, pk=["issue_id", "page_number"])
# Index the text
db["pages"].enable_fts(["text"])
create_db(title_id)
Exploring the SQLite database¶
You can open the database created above with any SQLite client; however, I think the easiest option for data exploration is Datasette.
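For example, you can query the full-text index directly with sqlite-utils – a minimal sketch, assuming the database was created by create_db() above; 'flood' is just an arbitrary search term.
# Optional: open the database created above and run a full-text search against the pages table
db_path = next(Path("downloads", title_id, "datasette").glob("*.db"))
db = Database(db_path)
# Table.search() uses the FTS index created by enable_fts()
for row in db["pages"].search("flood", columns=["issue_details", "issue_date", "page_number"], limit=5):
    print(row)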
The metadata.json
file created above defines a 'canned query' that generates a custom search page within Datasette. This page works best with a custom template I've developed that adds a few enhancements such as date facets and result highlighting. To start up Datasette with this custom search page, there are two options.
Run Datasette within the current environment¶
Datasette is already installed within the current Jupyter Lab environment, but starting it up requires a bit of fiddling with proxies and urls. The cell below handles all of that, and generates a big, blue 'View in Datasette' button to click. It also downloads the custom template to your data directory.
def download_template(title_id):
db_dir = Path("downloads", title_id, "datasette")
if not Path(db_dir, "templates").exists():
response = requests.get(
"https://github.com/GLAM-Workbench/datasette-lite-search/raw/refs/heads/main/templates.zip"
)
zipped = ZipFile(BytesIO(response.content))
zipped.extractall(path=db_dir)
def get_proxy_url():
# Get current running servers
servers = serverapp.list_running_servers()
base_url = next(servers)["base_url"]
# Create a base url for Datasette using the proxy path
proxy_url = f"{base_url}proxy/8001/"
return proxy_url
def open_datasette(timestamp=None):
"""
This gets the base url of the currently running notebook. It then uses this url to
construct a link to your Datasette instance. Finally it creates a button to open up a new tab to view your database.
"""
download_template(title_id)
data_dir = Path("downloads", title_id)
db_dir = Path(data_dir, "datasette")
db_path = next(Path(db_dir).glob("*.db"))
proxy_url = get_proxy_url()
# Display a link to Datasette
display(
HTML(
f'<p><a style="width: 200px; display: block; border: 1px solid #307fc1; background-color: #1976d2; color: #ffffff; padding: 10px; text-align: center; font-weight: bold;"href="{proxy_url}">View in Datasette</a> (Click on the stop button in the top menu bar to close the Datasette server)</p>'
)
)
# Launch Datasette
!datasette {db_path} --port 8001 --setting base_url {proxy_url} --template-dir {db_dir}/templates --metadata {db_dir}/metadata.json --config truncate_cells_html:100
open_datasette()
Use Datasette-Lite¶
If you don't want to install new software or run Jupyter Lab to explore your data, you can use Datasette-Lite. Datasette-Lite is a version of Datasette that doesn't require a special server. You just point your browser at a static web page and Datasette is installed and run within your browser.
I've created a custom Datasette-Lite repository with everything you need, but first you have to save your database and metadata.json
file somewhere on the web, so that Datasette-Lite can access them. I'd suggest creating a GitHub repository, and uploading them there:
- First download the files from Jupyter Lab. As noted above, the database and metadata.json files are saved in the downloads/[title id]/datasette directory. Use Jupyter Lab's file browser to find the files, then right-click on them and select 'Download'.
- If you don’t have one already, you’ll need to create a GitHub user account – the standard free, personal account is fine.
- Once you're logged into GitHub, create a new repository. Make sure that you set the 'visibility' to public.
- Go to the new repository and upload the database and metadata.json files.
Once your files are publicly available on the web, you can construct a url to load them in Datasette-Lite:
- The base url for the Datasette-Lite page is https://glam-workbench.net/datasette-lite-search/.
- In your new GitHub repository, right-click on the database file name and select 'copy link'. Add ?url=[url of database] to the base url.
- Now, right-click on the metadata file name and select 'copy link'. Add &metadata=[url of metadata] to the url.
- That's it! Copy the finished url and load it in your browser.
Here's an example using The Triangle:
- The files are saved in this repository.
- The url of the database file is:
https://github.com/GLAM-Workbench/trove-ned-periodicals/blob/main/dbs/the-triangle/the-triangle.db
- The url of the metadata file is:
https://github.com/GLAM-Workbench/trove-ned-periodicals/blob/main/dbs/the-triangle/metadata.json
- Put it all together and you get: https://glam-workbench.net/datasette-lite-search/?url=https://github.com/GLAM-Workbench/trove-ned-periodicals/blob/main/dbs/the-triangle/the-triangle.db&metadata=https://github.com/GLAM-Workbench/trove-ned-periodicals/blob/main/dbs/the-triangle/metadata.json Click on this link to search The Triangle using Datasette-Lite.
# IGNORE THIS CELL -- FOR TESTING ONLY
title_id = "nla.obj-971998493"
harvest_issues(title_id)
download_pdfs(title_id)
extract_text(title_id)
create_db(title_id)
db_dir = Path("downloads", title_id, "datasette")
db_path = next(Path(db_dir).glob("*.db"))
db = Database(db_path)
shutil.rmtree(Path("downloads", title_id))
Created by Tim Sherratt for the GLAM Workbench.