Harvest the full collection of Pandora titles
This notebook harvests a complete collection of archived web page titles from Pandora, the National Library of Australia's selective web archive.
Pandora has been selecting web sites and online resources for preservation since 1996. It has assembled a collection of more than 80,000 titles, organised into subjects and collections. The archived websites are now part of the Australian Web Archive (AWA), which combines the selected titles with broader domain harvests, and is searchable through Trove. However, Pandora's curated collections offer a useful entry point for researchers trying to find web sites relating to particular topics or events.
By combining the list of titles with data harvested from Pandora's hierarchy of subjects and collections, you can create datasets of archived urls relating to specific topics.
What are titles?
Pandora's 'titles' are not single resources; they're groups of resources. Titles link to snapshots of a web resource captured on different dates (also known as Mementos). They also bring together the different urls or domains that have pointed to the resource over time. This means that each title can be linked to multiple urls. This notebook unpacks the title records to create an entry for each archived url.
Harvesting method
There are two main processes used to harvest the data:
- scraping Pandora's complete list of titles to save the link and name for each title
- requesting a machine-readable version of the Title Entry Page (TEP) for each title and saving all the archived urls grouped within the title
The title links have the form /tep/[TEP number] and lead to a human-readable Title Entry Page in Trove. However, by changing the url, you can get a JSON version of the TEP. For example:
- https://webarchive.nla.gov.au/tep/131444 – goes to TEP web page
- https://webarchive.nla.gov.au/bamboo-service/tep/131444 – returns JSON version of TEP
The JSON data includes a list of instances that point to individual snapshots (or Mementos) of the title. As far as I can tell, the TEPs only include snapshots captured through Pandora's selective archiving processes. Additional snapshots of a resource might have been captured by a domain crawl and included in the Australian Web Archive. A complete list of captures can be retrieved by using the url of the archived resource to request a Timemap.
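For example, here's a minimal, self-contained sketch (not part of the harvest itself) that fetches the JSON version of a TEP and prints the gatheredUrl of each instance, then requests a Timemap for an archived url. The Timemap endpoint shown (web.archive.org.au/awa/timemap/link/) and the example url are assumptions for illustration – check them before relying on them.
import requests

# Fetch the JSON version of a TEP and list the gatheredUrl of each instance
tep = requests.get("https://webarchive.nla.gov.au/bamboo-service/tep/131444").json()
for instance in tep["instances"]:
    print(instance.get("gatheredUrl"))

# Request a Timemap listing all captures of a url in the Australian Web Archive
# (the /awa/timemap/link/ endpoint and example url are assumptions -- adjust as needed)
timemap = requests.get(
    "https://web.archive.org.au/awa/timemap/link/http://www.example.com/"
)
print(timemap.text)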
The harvesting process attempts to extract all the archived urls from the gatheredUrl field in the instance data. However, it seems that when Pandora snapshots are migrated to the AWA, the gatheredUrl value is set to point to the snapshot, rather than to the url of the original resource. The original url is embedded in the snapshot url, so the harvesting process extracts it using regular expressions.
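For example, using a made-up url, a migrated gatheredUrl value might bundle Pandora's identifier and capture date in front of the original address. Here's a rough sketch of the extraction (the same logic as the clean_url function defined below):
import re

# Hypothetical gatheredUrl pointing at a Pandora snapshot rather than the original resource
gathered_url = "/12345/20060101-0000/www.example.com/index.html"

# Strip the Pandora identifier and capture date from the front of the url
match = re.search(r"^/?[A-Z0-9]*/?[A-Za-z0-9-]+/", gathered_url)
if match:
    gathered_url = gathered_url[match.end():]

# Restore the scheme if it's missing
if not gathered_url.startswith("http"):
    gathered_url = f"http://{gathered_url}"

print(gathered_url)  # http://www.example.com/index.html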
The urls extracted from each title record are de-duplicated, and each unique value is saved as a separate row in the resulting dataset. This means there can be multiple records for each title.
Dataset structure
The dataset includes a row for each unique url from each title. The fields are:
- tep_id – the TEP identifier in the form /tep/[TEP NUMBER]
- name – the name of the title
- gathered_url – the url that was archived
- surt – a version of the url (the Sort-friendly URI Reordering Transform) that reverses the order of the domain components to put the top-level domain first, making it easier to group or sort resources by domain (see the example below)
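For example, the surt package used later in this notebook reduces a url and its http/https/www variants to the same SURT string. The outputs shown are indicative, assuming the package's default settings:
from surt import surt

# These variants should all produce the same SURT (output is indicative)
print(surt("http://www.nla.gov.au/about"))  # au,gov,nla)/about
print(surt("https://nla.gov.au/about"))     # au,gov,nla)/about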
A pre-harvested version of this dataset is available from the Pandora titles data repository.
import json
import os
import re
import time
from pathlib import Path
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from surt import surt
from tqdm.auto import tqdm
# Create a cached session for TEP requests and retry automatically on server errors
s = requests_cache.CachedSession("titles.db")
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
def harvest_titles(output="titles_all.ndjson", sample_only=False):
"""
Scrapes details of all titles from the Pandora website.
"""
Path(output).unlink(missing_ok=True)
page = 1
with tqdm() as pbar:
        # Continue harvesting page by page until there are no results
while page:
# Request a page of title links
response = requests.get(f"http://pandora.nla.gov.au/alpha/ALL/{page}")
soup = BeautifulSoup(response.text, "lxml")
title_links = []
with Path(output).open("a") as titles_file:
# Find all the item lists on the page and loop through them
for item_list in soup.find_all("div", class_="itemlist"):
# Get all the tep links
title_links = item_list.find_all("a", href=re.compile(r"/tep/\d+"))
# Save the tep id and name
for title_link in title_links:
titles_file.write(
json.dumps(
{
"tep_id": title_link["href"],
"name": title_link.string,
}
)
+ "\n"
)
pbar.update(1)
            # If there are title links on this page, increment the page value and continue
            if title_links and not sample_only:
                page += 1
            # If there are no title links, stop harvesting
            else:
                page = None
time.sleep(0.5)
harvest_titles()
Extract archived urls from TEPs
Now we'll request data for each TEP and extract the archived urls.
def clean_url(url):
"""
Get the harvested url from a Pandora snapshot link.
"""
match = re.search(r"^/?[A-Z0-9]*/?[A-Za-z0-9-]+/", url)
if match:
url = url[match.end() :]
if not url.startswith("http"):
url = f"http://{url}"
return url
def add_title_urls(input="titles_all.ndjson", output="title_urls.ndjson"):
    """
    Requests the JSON data for each TEP and saves an entry for each unique archived url.
    """
with Path(input).open("r") as input_file:
with Path(output).open("w") as output_file:
for line in tqdm(input_file):
tep_data = json.loads(line)
# Get TEP JSON
url = (
f"https://webarchive.nla.gov.au/bamboo-service{tep_data['tep_id']}"
)
response = s.get(url)
# Some TEPs produce 500 errors -- seems they're no longer in the archive?
if response.ok:
data = response.json()
instance_urls = []
# Title record includes multiple instances
# An instance can be a different url, or a Pandora snapshot
# We want to get all the distinct urls, so we'll trim the Pandora bits from urls and
# use surts to merge http, https, www addresses
surts = []
for instance in data["instances"]:
# First we'll use the `gatheredUrl` field
if gathered_url := instance.get("gatheredUrl"):
# Remove the Pandora part of the url (if there is one)
gathered_url = clean_url(gathered_url)
try:
tep_surt = surt(gathered_url)
# This is to handle a broken url
except ValueError:
gathered_url = gathered_url.replace(
"http://https:", "http://"
)
tep_surt = surt(gathered_url)
# If there's no `gatheredUrl`, we'll use the `url`
elif tep_url := instance.get("url"):
# Remove Pandora part of link
gathered_url = re.search(
r"http://pandora.nla.gov.au/pan/\w+/\w+-\w+/(.*)",
tep_url,
).group(1)
if not gathered_url.startswith("http"):
gathered_url = f"http://{gathered_url}"
tep_surt = surt(gathered_url)
else:
tep_surt = None
# Add url to list if we don't already have it (check surts)
if tep_surt and tep_surt not in surts:
instance_urls.append(gathered_url)
surts.append(tep_surt)
# Save each url
for instance_url in sorted(set(instance_urls)):
tep_data["gathered_url"] = instance_url
tep_data["surt"] = surt(instance_url)
output_file.write(json.dumps(tep_data) + "\n")
if not response.from_cache:
time.sleep(0.5)
else:
output_file.write(json.dumps(tep_data) + "\n")
add_title_urls()
dft = pd.read_json("title_urls.ndjson", lines=True)
dft.to_csv("pandora-titles.csv", index=False, encoding="utf-8-sig")
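As a quick, optional check of the results, you could use the surt values to group the urls by domain – for example, counting the number of archived urls under each domain. This assumes the dft dataframe created above and the surt field described earlier.
# Count the number of archived urls under each domain (optional sanity check)
dft["domain"] = dft["surt"].str.split(")").str[0]
dft["domain"].value_counts().head(10)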
# IGNORE THIS CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
harvest_titles(output="test.ndjson", sample_only=True)
add_title_urls(input="test.ndjson", output="test_urls.ndjson")
Path("test.ndjson").unlink()
Path("test_urls.ndjson").unlink()
Created by Tim Sherratt for the GLAM Workbench.