Create a database to search across each line of text in a series of volumes¶
The code here was used to create the NSW Post Office Directories search interface, which helps you search across 54 volumes from 1886 to 1950. The same code, with minor modifications, could be used to index any publication where it would be useful to search by line (rather than Trove's default 'article') – for example, lists, directories, and gazetteers – turning them into searchable databases.
All you really need to get started is the top level identifier – that is, the Trove identifier for the publication series, under which individual volumes are listed. Where noted in the code, you'll also want to change a few names and details.
Because the harvesting process can be quite slow, and the amount of data processed quite large, you would probably want to run this notebook within a persistent environment rather than on Binder – either on your own computer, or in a cloud service like Reclaim Cloud or Nectar.
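For example, the cells below harvest the NSW Post Office Directories using the series identifier nla.obj-522689844. If you're adapting this notebook for another publication, these are the sorts of values you'll need to change. The sketch below just gathers them in one place for reference – the cells that follow hard-code them, so treat the variable names here as placeholders.
# Placeholder settings – the cells below hard-code these values,
# so this is just a reminder of what to change for another publication.

# Trove identifier for the top-level collection (the page listing the individual volumes)
series_id = "nla.obj-522689844"

# Directory where the harvested text files will be saved
output_dir = "directories"

# Name of the SQLite database (and the database key used in metadata.json)
db_name = "post-office-directories"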
# Let's import the libraries we need.
import json
import re
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from natsort import natsorted, ns
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
from tqdm.auto import tqdm
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
Get the data!¶
The first step is to download all of the OCRd text for each volume in the series. If they're large volumes this could take a long time.
def harvest_metadata(obj_id):
"""
This calls an internal API from a journal landing page to extract a list of available issues.
"""
# The initial startIdx value
start = 0
# Number of results per page
n = 20
issues = []
with tqdm(desc="Issues", leave=False) as pbar:
        # Keep harvesting until a page returns fewer than 20 results – that means we've reached the end.
while n == 20:
# Get the browse page
response = s.get(
f"https://nla.gov.au/{obj_id}/browse?startIdx={start}&rows=20&op=c",
timeout=60,
)
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "lxml")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
issue = {}
title = detail.find("h3")
if title:
issue["title"] = title.text
issue["id"] = title.parent["href"].strip("/")
else:
issue["title"] = "No title"
issue["id"] = detail.find("a")["href"].strip("/")
try:
# Get the issue details
issue["details"] = detail.find(
class_="obj-reference content"
).string.strip()
except (AttributeError, IndexError):
issue["details"] = "issue"
# Get the number of pages
try:
issue["pages"] = int(
re.search(
r"^(\d+)",
detail.find("a", attrs={"data-pid": issue["id"]}).text,
flags=re.MULTILINE,
).group(1)
)
except AttributeError:
issue["pages"] = 0
issues.append(issue)
# print(issue)
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
pbar.update(n)
return issues
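Each item in the list returned by harvest_metadata() is a dictionary recording the issue's title, identifier, details and page count. A made-up example of the structure (the values are purely illustrative):
# Hypothetical example of a single issue record (values invented for illustration)
{
    "title": "The New South Wales Post Office directory",
    "id": "nla.obj-000000000",
    "details": "1886",
    "pages": 1000,
}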
def save_ocr(issues, obj_id, title=None, output_dir="directories"):
"""
Download the OCRd text for each page in each issue.
"""
if not title:
title = issues[0]["title"]
for issue in tqdm(issues, desc="Texts", leave=False):
        # Skip issues that have no pages
if issue["pages"] != 0:
issue_name = f'{slugify(issue["title"])[:50]}-{slugify(issue["details"])}-{issue["id"]}'
issue_path = Path(output_dir, issue_name)
issue_path.mkdir(exist_ok=True, parents=True)
for page in tqdm(range(0, issue["pages"])):
file_path = Path(issue_path, f"{issue_name}-page{page}.txt")
# Check to see if the file has already been harvested
if not file_path.exists():
url = f'https://trove.nla.gov.au/{issue["id"]}/download?downloadOption=ocr&firstPage={page}&lastPage={page}'
# print(url)
# Get the file
r = s.get(url, timeout=120)
# Check there was no error
if r.status_code == requests.codes.ok:
# Check that the file's not empty
r.encoding = "utf-8"
if len(r.text) > 0 and not r.text.isspace():
# Check that the file isn't HTML (some not found pages don't return 404s)
if (
BeautifulSoup(r.text, "html.parser").find("html")
is None
):
# If everything's ok, save the file
with open(
file_path, "w", encoding="utf-8"
) as text_file:
text_file.write(r.text)
time.sleep(1)
Get a list of available issues for this title.
issues = harvest_metadata("nla.obj-522689844")
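Harvesting the issue list involves a lot of requests, so you might want to save the results to a file that can be reloaded later without re-harvesting. A minimal sketch – the issues.json filename is just an example:
# Optional: save the harvested issue metadata so it can be reloaded in a later session
with open("issues.json", "w", encoding="utf-8") as json_file:
    json.dump(issues, json_file, indent=2)

# To reload it later:
# with open("issues.json", "r", encoding="utf-8") as json_file:
#     issues = json.load(json_file)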
Save the OCRd text of each issue page by page. This means there'll be one text file for every page in every volume.
save_ocr(issues, "nla.obj-522689844")
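Downloading every page can take many hours for a large series. You can check progress (or confirm a completed harvest) by counting the text files saved so far – a quick sketch, assuming the default 'directories' output folder:
# Count the page text files harvested so far (one file per page, in one folder per volume)
harvested_pages = list(Path("directories").glob("*/*.txt"))
print(f"{len(harvested_pages)} pages saved across {len(set(p.parent for p in harvested_pages))} volumes")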
Assemble the database!¶
Now we've got all the text, we can process each line and add everything into an SQLite database.
def get_work_metadata(obj_id):
"""
Extracts metadata embedded as a JSON string in a work's HTML page.
See: https://glam-workbench.net/trove-books/metadata-for-digital-works/
"""
# Get the HTML page
    response = s.get(f"https://nla.gov.au/{obj_id}", timeout=60)
# Search for the JSON string using regex
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
# Just in case it's not there...
work_data = "{}"
print("No data found!")
# Return the JSON data
return json.loads(work_data)
db_path = Path("datasette-test")
db_path.mkdir(exist_ok=True)
# This is the basic metadata file that will be used by Datasette
# The processing steps below will add details for each table/volume into the metadata file
# Obviously you'd modify this for a different publication!
metadata = {
"title": "New South Wales Post Office Directories",
"description_html": "<p>This is an experimental interface to the NSW Post Office Directories, which have been digitised from the collections of the State Library of NSW and are <a href='https://nla.gov.au/nla.obj-518308191'>now available on Trove</a>.</p><p>Searching for entries in the directories on Trove is difficult, because Trove's results are grouped by 'article', which is not very helpful for publications like these where information is organised by row. This interface searches for individual lines of text, rather than pages or articles. So it points you straight to entries of interest. Once you've found something, you can view the entry within the context of the complete page, or click back to Trove to explore further.</p><p>You can currently search across 15 volumes from 1886 to 1908.",
"databases": {
"post-office-directories": {
"title": "New South Wales Post Office Directories, 1886 to 1908",
"source": "Trove",
"source_url": "https://nla.gov.au/nla.obj-518308191",
"tables": {},
}
},
}
Now we'll process the data and add it to an SQLite database.
# Create the database
# Change the name as appropriate!
db = Database(Path(db_path, "post-office-directories.db"))
# Create database tables for pages and volumes
page_table = db["pages"]
vols_table = db["volumes"]
# Loop through the directories of each volume created by the harvesting process (above)
# Use natsorted so that they're processed in date order
vols = natsorted(
[
d
for d in Path("directories").glob(
"the-new-south-wales-post-office-directory-collection*"
)
if d.is_dir()
]
)
for vol in vols:
print(vol)
# In the case of the PO Directories each volume has a different year (or year range) in the title.
# Here we're extracting the year and id, but extracting the year in this way might not work for other titles.
# The year is used as the title of the table in Datasette
year, obj_id = re.search(r"-([0-9\-]+?)-(nla.obj-\d+)", vol.name).groups()
# Add a record for this volume to the database
vols_table.insert({"vol_id": obj_id, "year": year}, pk="vol_id")
# Get the embedded JSON data for this volume and extract a list of pages
work_data = get_work_metadata(obj_id)
work_pages = work_data["children"]["page"]
# Get the title from the work metadata
title = work_data["title"]
# Update the metadata file with details of this volume
metadata["databases"]["post-office-directories"]["tables"][year] = {
"title": f"{title} {year}",
"source": "Trove",
"source_url": f"https://nla.gov.au/{obj_id}",
"searchmode": "raw",
}
# Create a table for this volume. For the PO directories I'm using the year as the table name.
# If year isn't available, some other way of naming the table would be necessary, such as the full title.
vol_table = db[year]
# Loop through all the text page by page for this volume
pages = natsorted([p for p in vol.glob("*") if p.is_file()], alg=ns.PATH)
for page in pages:
lines = []
# Insert details about this page into the pages table
        page_num = int(re.search(r"-page(\d+)$", page.stem).group(1))
page_table.insert(
{
"vol_id": obj_id,
"page_id": work_pages[page_num]["pid"],
"page": page_num,
},
pk=("vol_id", "page"),
foreign_keys=[("vol_id", "volumes")],
)
# Open the text file and loop through the lines
with page.open("r") as txt:
for num, line in enumerate(txt):
# Get rid of blank lines
line = line.replace("\n", "")
# If line is not blank, add details to a list of lines from this page
if line:
lines.append(
{
"page": page_num,
"line": num + 1,
"text": line,
"page_id": work_pages[page_num]["pid"],
}
)
# Insert all the lines from this page into the db
vol_table.insert_all(
lines, pk=("page", "line"), foreign_keys=[("page_id", "pages", "page_id")]
)
# Add a full text index on the line text
vol_table.enable_fts(["text"])
# Save the updated metadata file
with open(Path(db_path, "metadata.json"), "w") as md_file:
json.dump(metadata, md_file, indent=4)
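Before opening the database in Datasette, you can check that the full-text indexes behave as expected by querying one of the volume tables with sqlite-utils' search() method. A quick sketch – the table name ('1886') and search term are just examples, so use a year and a term that exist in your own data:
# Quick sanity check of the full-text index on one of the volume tables
# (the table name and search term below are examples only)
for row in db["1886"].search("blacksmith"):
    print(row["page"], row["line"], row["text"])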
Opening the db in Datasette¶
You should now have an SQLite database containing the indexed text of all the volumes in the collection. You can explore the database using Datasette. In a Python environment you'll first need to install it:
pip install datasette
Then, from within the directory containing the database, run:
datasette post-office-directories.db -m metadata.json
To share your database publicly, look at Datasette's publishing options.
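For example, Datasette can package and deploy the database to a hosting service in a single step. A sketch using the Google Cloud Run option – the service name is a placeholder, and you'll need the Google Cloud SDK installed and authenticated first:
datasette publish cloudrun post-office-directories.db --metadata metadata.json --service=post-office-directories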
Created by Tim Sherratt for the GLAM Workbench as part of the Everyday Heritage project.