Enrich the list of periodicals from the Trove API

In Periodicals from /magazine/titles endpoint I harvested details of digitised periodicals and issues from the Trove API, removed duplicate titles, removed Parliamentary Papers, and tried to find missing issues. I noted that there were still some problems – in particular, some title links actually went to issues, and vice versa. This notebook tries to fix those problems and enriches the harvested data by extracting additional information from the website. It creates two datasets – one for titles and one for issues – and loads these into an SQLite database for use with Datasette Lite.

In [35]:
# Let's import the libraries we need.
import json
import os
import re
import time
from datetime import timedelta
from pathlib import Path

import arrow
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from humanize import naturalsize
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
from tqdm.auto import tqdm

s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

load_dotenv()
Out[35]:
True

Check and enrich metadata

In [3]:
def get_metadata(id):
    """
    Extract work data in a JSON string from the work's HTML page.
    """
    if not id.startswith("http"):
        id = "https://nla.gov.au/" + id
    response = s.get(id)
    try:
        work_data = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        ).group(1)
    except AttributeError:
        work_data = "{}"
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)


def get_pages(work):
    """
    Get the number of pages from the work metadata.
    """
    try:
        pages = len(work["children"]["page"])
    except KeyError:
        pages = 0
    return pages


def get_title_ids():
    """
    Get the ids of all the titles in the previously harvested dataset.
    """
    with open("titles-issues-added.ndjson") as ndjson_in:
        return [json.loads(line)["id"] for line in ndjson_in]


def get_iso_date(date):
    """
    Convert a date string scraped from the viewer (eg 'Sat, 1 Jan 1916') to ISO format.
    """
    if date:
        iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY-MM-DD")
    else:
        iso_date = ""
    return iso_date


def get_issues(parent_id):
    """
    Get the ids of issues that are children of the current record.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    parts = []
    # Each browse page returns a maximum of 20 results, so keep harvesting
    # until a page has fewer than 20 (which means we've reached the end).
    while n == 20:
        # Get the browse page
        response = s.get(start_url.format(parent_id, start))
        # Beautifulsoup turns the HTML into an easily navigable structure
        soup = BeautifulSoup(response.text, "lxml")
        # Find all the divs containing issue details and loop through them
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            title = detail.find("h3")
            if title:
                issue_id = title.parent["href"].strip("/")
            else:
                issue_id = detail.find("a")["href"].strip("/")
            # Add the issue id to the list of parts
            parts.append(issue_id)
        if not response.from_cache:
            time.sleep(0.2)
        # Increment the startIdx
        start += n
        # Set n to the number of results on the current page
        n = len(details)
    return parts


def prepare_title(metadata, issue_count=0):
    """
    Create a periodical title record using metadata scraped from the digital viewer.
    """
    title = {
        "id": metadata.get("pid"),
        "title": metadata.get("title"),
        "description": metadata.get("subUnitNo", ""),
        "extent": metadata.get("extent", ""),
        "publisher": metadata.get("publisherName", ""),
        "issue_count": issue_count,
    }
    if bib_id := metadata.get("bibId"):
        title["catalogue_url"] = "https://nla.gov.au/nla.cat-vn" + bib_id
    return title


def prepare_issue(title, metadata, pages):
    """
    Create a periodical issue record using metadata scraped from the digital viewer.
    """
    pid = metadata.get("pid")
    issue = {
        "id": pid,
        "title_id": title.get("pid"),
        "title": title.get("title"),
        "description": metadata.get("subUnitNo", ""),
        "date": get_iso_date(metadata.get("issueDate", "")),
        "url": "https://nla.gov.au/" + pid,
        "pages": pages,
    }
    return issue


def enrich_periodicals_data(
    input_file="titles-issues-added.ndjson",
    output_titles="titles-enriched.ndjson",
    output_issues="issues-enriched.ndjson",
):
    """
    Work through all the titles and issues harvested from the API, checking that
    they are what they're supposed to be. Where titles are issues, or vice versa,
    try to add them to the right list.

    Also add extra metadata scraped from the digital viewer to title and issue records.
    """
    # Prepare output files
    titles_ndjson = Path(output_titles)
    titles_ndjson.unlink(missing_ok=True)
    issues_ndjson = Path(output_issues)
    issues_ndjson.unlink(missing_ok=True)

    # Get a list of current title ids to check against
    title_ids = get_title_ids()

    total = sum(1 for _ in open(input_file))

    # Loop through current titles
    with Path(input_file).open("r") as ndjson_in:
        for line in tqdm(ndjson_in, total=total):
            title_is_parent = True
            title = json.loads(line)
            # Scrape metadata from digital viewer
            title_metadata = get_metadata(title["id"])
            # If this has pages then it's actually an issue
            # So we'll try and get issue info
            if title_pages := get_pages(title_metadata):
                # Does it have a parent title?
                parent_metadata = title_metadata.get("parent")
                # If it does have a parent title and we don't have the title already,
                # save the title record
                if parent_metadata and parent_metadata["pid"] not in title_ids:
                    new_title = prepare_title(parent_metadata)
                    with titles_ndjson.open("a") as titles_out:
                        titles_out.write(f"{json.dumps(new_title)}\n")
                else:
                    print(title)
                # Create an issue record
                new_issue = prepare_issue(parent_metadata, title_metadata, title_pages)
                with issues_ndjson.open("a") as issues_out:
                    issues_out.write(f"{json.dumps(new_issue)}\n")
                # This record is really an issue, so there's no title record to update.
                # Skip the title processing below and move on to the next record.
                continue

            # If it really is a title, we'll create a new title record that combines
            # the original record with the scraped metadata
            updated_title = title | prepare_title(title_metadata)
            # Clean up a few fields
            updated_title["issue_count"] = updated_title.get("new_issue_count", 0)
            updated_title.pop("new_issue_count", None)
            updated_title.pop("unknown_dates", None)
            issues = title.get("issues", [])
            updated_title.pop("issues", None)

            # Loop through the issues associated with this title
            with issues_ndjson.open("a") as issues_out:
                for issue in issues:
                    issue_metadata = get_metadata(issue["id"])
                    issue_pages = get_pages(issue_metadata)
                    # If it doesn't have pages then it's probably a title
                    if not issue_pages:
                        # Try scraping a list of issues from the viewer
                        parts = get_issues(issue["id"])
                        # If it has issues, then we'll treat it like a title
                        if parts:
                            title_is_parent = False
                            # If we don't already have this as a title, then add it
                            if issue["id"] not in title_ids:
                                new_title = prepare_title(issue_metadata, len(parts))
                                with titles_ndjson.open("a") as titles_out:
                                    titles_out.write(f"{json.dumps(new_title)}\n")
                        # Add all the issues belonging to this title
                        for part in parts:
                            part_metadata = get_metadata(part)
                            part_pages = get_pages(part_metadata)
                            new_issue = prepare_issue(
                                issue_metadata, part_metadata, part_pages
                            )
                            issues_out.write(f"{json.dumps(new_issue)}\n")
                    # If it is an issue, create a new record that adds in the scraped metadata and number of pages
                    else:
                        updated_issue = issue | prepare_issue(
                            title_metadata, issue_metadata, issue_pages
                        )
                        issues_out.write(f"{json.dumps(updated_issue)}\n")

            # If it really is a title (and not an issue) write it to the dataset
            if title_is_parent:
                with titles_ndjson.open("a") as titles_out:
                    titles_out.write(f"{json.dumps(updated_title)}\n")
In [ ]:
enrich_periodicals_data()
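
As a quick sanity check (just a minimal sketch, assuming the enrichment run above has completed and used the default output file names), you can count the number of records written to each of the enriched NDJSON files.

In [ ]:
# Count the title and issue records written by the enrichment step
for ndjson_file in ["titles-enriched.ndjson", "issues-enriched.ndjson"]:
    with open(ndjson_file) as ndjson_in:
        print(ndjson_file, sum(1 for _ in ndjson_in))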

Create issues dataset

In [49]:
def save_issues(
    input_file="issues-enriched.ndjson", output_file="periodical-issues.csv"
):
    df = pd.read_json(input_file, lines=True)
    df.drop_duplicates(inplace=True)

    # Remove where id is duplicated and id = title_id
    df = df.loc[~((df.duplicated("id")) & (df["id"] == df["title_id"]))]

    def add_download_link(row):
        last_page = row["pages"] - 1
        return f"https://trove.nla.gov.au/{row['id']}/download?downloadOption=ocr&firstPage=0&lastPage={last_page}"

    # Add a link to download the complete issue text
    df["text_download_url"] = df.apply(add_download_link, axis=1)

    # Save as CSV
    df.sort_values(["title", "date"]).to_csv(output_file, index=False)

    # Add thumbnail details in JSON for Datasette
    df.insert(
        0,
        "thumbnail",
        df["url"].apply(
            lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
        ),
    )
    return df
In [50]:
df_issues = save_issues()
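
As an optional check (a minimal sketch using the dataframe returned above), the cell below summarises the issues dataset: the number of issue records, the number of titles they belong to, and the total number of digitised pages.

In [ ]:
# Summarise the issues dataset
print("Issue records:", len(df_issues))
print("Titles represented:", df_issues["title_id"].nunique())
print("Total pages:", int(df_issues["pages"].sum()))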

Create titles dataset

In [71]:
def merge_lists(column):
    """
    Flatten list values into pipe-separated strings so they can be saved as CSV.
    """
    try:
        return column.apply(lambda x: "|".join(x) if isinstance(x, list) else x)
    except AttributeError:
        return column


def add_download_link(row):
    """
    Create a link (as a JSON object for Datasette) to a zip file containing
    the OCRd text of this title, including the size of the download.
    """
    if row["issue_count"] > 0:
        title = slugify(row["title"])[:50]
        url = f"https://trove-journals.s3.ap-southeast-2.amazonaws.com/{title}-{row['title_id']}.zip"
        response = s.head(url)
        size = int(response.headers["Content-Length"])
        return (
            f'{{"href": "{url}", "label": "Download text ({naturalsize(size)} zip)"}}'
        )
    return ""


def save_titles(
    df_issues, input_file="titles-enriched.ndjson", output_file="periodical-titles.csv"
):
    df = pd.read_json(input_file, lines=True)
    df = df.apply(merge_lists)
    df = df.sort_values("issue_count").drop_duplicates(["id"], keep="last")

    # Add thumbnail details from issues
    df = pd.merge(
        df,
        df_issues.sort_values("date")
        .groupby("title_id")
        .head(1)[["title_id", "thumbnail"]],
        how="left",
        left_on="id",
        right_on="title_id",
    )
    # Add a url that will search for articles in the periodical
    df["search_url"] = df["id"].apply(
        lambda x: f'{{"href": "https://trove.nla.gov.au/search/category/magazines?keyword=%22{x}%22", "label": "Search for articles in Trove"}}'
    )

    # Add a link to a zip file containing the OCRd text of this title
    df["download_text"] = df.apply(add_download_link, axis=1)

    # Clean up column names
    df.rename(
        columns={
            "troveUrl": "trove_url",
            "startDate": "start_date",
            "endDate": "end_date",
        },
        inplace=True,
    )

    # Sort columns
    df = df[
        [
            "thumbnail",
            "id",
            "title",
            "description",
            "publisher",
            "trove_url",
            "search_url",
            "download_text",
            "issue_count",
            "start_date",
            "end_date",
            "start_year",
            "end_year",
            "extent",
            "place",
            "issn",
            "catalogue_url",
        ]
    ]

    # Make sure numbers are integers
    df["issue_count"] = df["issue_count"].astype("Int64")
    df["start_year"] = df["start_year"].astype("Int64")
    df["end_year"] = df["end_year"].astype("Int64")

    # Save data to CSV
    df_csv = df.copy()
    # Extract the url for text downloads
    df_csv["download_text"] = df["download_text"].apply(
        lambda x: json.loads(x)["href"] if x else ""
    )
    # Remove thumbnail and search_url and save as CSV
    df_csv.drop(columns=["thumbnail", "search_url"]).sort_values("title").to_csv(
        output_file, index=False
    )

    return df
In [72]:
df_titles = save_titles(df_issues)
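
Similarly, you can peek at the titles with the most digitised issues (again just a quick sketch, using the issue_count column of the dataframe returned above).

In [ ]:
# Show the ten titles with the most issues
df_titles.sort_values("issue_count", ascending=False)[["title", "issue_count"]].head(10)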

Create an SQLite database

In [34]:
db = Database("periodicals.db", recreate=True)

db["titles"].insert_all(df_titles.to_dict(orient="records"), pk="id")
db["titles"].enable_fts(["title", "publisher"])

df_issues = df_issues.drop("title", axis=1)
db["issues"].insert_all(df_issues.to_dict(orient="records"), pk="id")
db["issues"].add_foreign_key("title_id", "titles", "id")
Out[34]:
<Table issues (thumbnail, id, title_id, description, date, url, pages, text_download_url)>
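
Because full-text search has been enabled on the titles table, you can query the new database directly using sqlite-utils. The cell below is just an illustrative sketch (the search term is arbitrary): it counts the rows in each table and runs a sample full-text search. Once the database file is published somewhere web-accessible, it should also be possible to open it in Datasette Lite by passing its address via the url parameter.

In [ ]:
# Count the rows in each table
print(list(db.query("select count(*) as titles from titles")))
print(list(db.query("select count(*) as issues from issues")))
# Try a full-text search on the titles (the search term is just an example)
for row in db["titles"].search("gazette", limit=5):
    print(row["id"], row["title"])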
In [10]:
# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    df_test = pd.read_json("titles-issues-added.ndjson", lines=True)
    df_test.loc[(df_test["issn"].notnull()) & (df_test["new_issue_count"] < 20)][
        :10
    ].to_json("test.ndjson", orient="records", lines=True)
    enrich_periodicals_data(
        input_file="test.ndjson",
        output_titles="titles-test.ndjson",
        output_issues="issues-test.ndjson",
    )
    df_issues = save_issues(
        input_file="issues-test.ndjson", output_file="issues-test.csv"
    )
    df_titles = save_titles(
        df_issues, input_file="titles-test.ndjson", output_file="titles-test.csv"
    )
    Path("test.ndjson").unlink()
    Path("issues-test.ndjson").unlink()
    Path("titles-test.ndjson").unlink()
    Path("issues-test.csv").unlink()
    Path("titles-test.csv").unlink()

Created by Tim Sherratt for the GLAM Workbench.