Enrich the list of periodicals from the Trove API¶
In Periodicals from /magazine/titles endpoint I harvested details of digitised periodicals and issues from the Trove API, removed duplicate titles, removed Parliamentary Papers, and tried to find missing issues. I noted that there were still some problems – in particular, some title links actually went to issues, and vice versa. This notebook tries to fix those problems and enriches the harvested data by extracting additional information from the website. It creates two datasets – one for titles and one for issues – and loads these into an SQLite database for use with Datasette Lite.
In [35]:
# Let's import the libraries we need.
import json
import os
import re
import time
from datetime import timedelta
from pathlib import Path
import arrow
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from humanize import naturalsize
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
from tqdm.auto import tqdm
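# Create a cached session (responses expire after 30 days) that retries failed requests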
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
Out[35]:
True
Check and enrich metadata¶
In [3]:
def get_metadata(id):
"""
Extract work data in a JSON string from the work's HTML page.
"""
if not id.startswith("http"):
id = "https://nla.gov.au/" + id
response = s.get(id)
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
if not response.from_cache:
time.sleep(0.2)
return json.loads(work_data)
def get_pages(work):
"""
Get the number of pages from the work metadata.
"""
try:
pages = len(work["children"]["page"])
except KeyError:
pages = 0
return pages
def get_title_ids():
    """
    Get a list of the title ids in the previously harvested dataset.
    """
    with open("titles-issues-added.ndjson") as ndjson_in:
        return [json.loads(line)["id"] for line in ndjson_in]
def get_iso_date(date):
    """
    Convert an issue date (formatted like 'Sat, 1 Jan 1916') to ISO format (YYYY-MM-DD).
    """
    if date:
        iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY-MM-DD")
    else:
        iso_date = ""
    return iso_date
def get_issues(parent_id):
"""
Get the ids of issues that are children of the current record.
"""
start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
# The initial startIdx value
start = 0
# Number of results per page
n = 20
parts = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
# Get the browse page
response = s.get(start_url.format(parent_id, start))
        # BeautifulSoup turns the HTML into an easily navigable structure
        soup = BeautifulSoup(response.text, "lxml")
        # Find all the divs containing issue details and loop through them
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            # Get the issue id from the link in the issue details
            title = detail.find("h3")
            if title:
                issue_id = title.parent["href"].strip("/")
            else:
                issue_id = detail.find("a")["href"].strip("/")
            parts.append(issue_id)
if not response.from_cache:
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return parts
def prepare_title(metadata, issue_count=0):
"""
Create a periodical title record using metadata scraped from the digital viewer.
"""
title = {
"id": metadata.get("pid"),
"title": metadata.get("title"),
"description": metadata.get("subUnitNo", ""),
"extent": metadata.get("extent", ""),
"publisher": metadata.get("publisherName", ""),
"issue_count": issue_count,
}
if bib_id := metadata.get("bibId"):
title["catalogue_url"] = "https://nla.gov.au/nla.cat-vn" + bib_id
return title
def prepare_issue(title, metadata, pages):
"""
Create a periodical issue record using metadata scraped from the digital viewer.
"""
pid = metadata.get("pid")
issue = {
"id": pid,
"title_id": title.get("pid"),
"title": title.get("title"),
"description": metadata.get("subUnitNo", ""),
"date": get_iso_date(metadata.get("issueDate", "")),
"url": "https://nla.gov.au/" + pid,
"pages": pages,
}
return issue
def enrich_periodicals_data(
input_file="titles-issues-added.ndjson",
output_titles="titles-enriched.ndjson",
output_issues="issues-enriched.ndjson",
):
"""
Work through all the titles and issues harvested from the API, checking that
they are what they're supposed to be. Where titles are issues, or vice versa,
try to add them to the right list.
Also add extra metadata scraped from the digital viewer to title and issue records.
"""
# Prepare output files
titles_ndjson = Path(output_titles)
titles_ndjson.unlink(missing_ok=True)
issues_ndjson = Path(output_issues)
issues_ndjson.unlink(missing_ok=True)
# Get a list of current title ids to check against
title_ids = get_title_ids()
total = sum(1 for _ in open(input_file))
# Loop through current titles
with Path(input_file).open("r") as ndjson_in:
for line in tqdm(ndjson_in, total=total):
title_is_parent = True
title = json.loads(line)
# Scrape metadata from digital viewer
title_metadata = get_metadata(title["id"])
# If this has pages then it's actually an issue
# So we'll try and get issue info
if title_pages := get_pages(title_metadata):
# Does it have a parent title?
parent_metadata = title_metadata.get("parent")
# If it does have a parent title and we don't have the title already,
# save the title record
if parent_metadata and parent_metadata["pid"] not in title_ids:
new_title = prepare_title(parent_metadata)
with titles_ndjson.open("a") as titles_out:
titles_out.write(f"{json.dumps(new_title)}\n")
                else:
                    # Flag records that have no parent, or whose parent is already in the title list
                    print(title)
# Create an issue record
new_issue = prepare_issue(parent_metadata, title_metadata, title_pages)
with issues_ndjson.open("a") as issues_out:
issues_out.write(f"{json.dumps(new_issue)}\n")
# If it is really a title, we'll create a new title record that combines
# the original record with the scraped metadata
else:
updated_title = title | prepare_title(title_metadata)
# Clean up a few fields
updated_title["issue_count"] = updated_title.get("new_issue_count", 0)
updated_title.pop("new_issue_count", None)
updated_title.pop("unknown_dates", None)
issues = title.get("issues", [])
updated_title.pop("issues", None)
# Loop through the issues associated with this title
with issues_ndjson.open("a") as issues_out:
for issue in issues:
issue_metadata = get_metadata(issue["id"])
issue_pages = get_pages(issue_metadata)
# If it doesn't have pages then it's probably a title
if not issue_pages:
# Try scraping a list of issues from the viewer
parts = get_issues(issue["id"])
# If it has issues, then we'll treat it like a title
if parts:
title_is_parent = False
# If we don't already have this as a title, then add it
if issue["id"] not in title_ids:
new_title = prepare_title(issue_metadata, len(parts))
with titles_ndjson.open("a") as titles_out:
titles_out.write(f"{json.dumps(new_title)}\n")
# Add all the issues belonging to this title
for part in parts:
part_metadata = get_metadata(part)
part_pages = get_pages(part_metadata)
new_issue = prepare_issue(
issue_metadata, part_metadata, part_pages
)
issues_out.write(f"{json.dumps(new_issue)}\n")
# If it is an issue, create a new record that adds in the scraped metadata and number of pages
else:
updated_issue = issue | prepare_issue(
title_metadata, issue_metadata, issue_pages
)
issues_out.write(f"{json.dumps(updated_issue)}\n")
# If it really is a title (and not an issue) write it to the dataset
if title_is_parent:
with titles_ndjson.open("a") as titles_out:
titles_out.write(f"{json.dumps(updated_title)}\n")
In [ ]:
enrich_periodicals_data()
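Once the enrichment above has finished, you might want a quick look at the results. The cell below is a minimal sketch (assuming the default output file names used above) that prints the first record from each of the enriched NDJSON files.
In [ ]:
# A quick peek at the enriched data -- assumes the default output files
# created by enrich_periodicals_data() exist in the current directory.
for data_file in ["titles-enriched.ndjson", "issues-enriched.ndjson"]:
    with Path(data_file).open("r") as ndjson_in:
        print(json.loads(ndjson_in.readline()))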
Create issues dataset¶
In [49]:
def save_issues(
input_file="issues-enriched.ndjson", output_file="periodical-issues.csv"
):
df = pd.read_json(input_file, lines=True)
df.drop_duplicates(inplace=True)
# Remove where id is duplicated and id = title_id
df = df.loc[~((df.duplicated("id")) & (df["id"] == df["title_id"]))]
    def add_download_link(row):
        # Page numbering in the download URL is zero-based, so the last page is pages - 1
        last_page = row["pages"] - 1
        return f"https://trove.nla.gov.au/{row['id']}/download?downloadOption=ocr&firstPage=0&lastPage={last_page}"
# Add a link to download the complete issue text
df["text_download_url"] = df.apply(add_download_link, axis=1)
    # Save as CSV (without the thumbnail column added below)
    df.sort_values(["title", "date"]).to_csv(output_file, index=False)
    # Add thumbnail details as JSON for display in Datasette
df.insert(
0,
"thumbnail",
df["url"].apply(
lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
),
)
return df
In [50]:
df_issues = save_issues()
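If you want to check the download links, the cell below (an optional sketch, assuming save_issues() has just run with its defaults) previews a few rows; each text_download_url should end with lastPage set to the issue's page count minus one.
In [ ]:
# Optional check: the lastPage value in each download link should be pages - 1.
df_issues[["id", "pages", "text_download_url"]].head()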
Create titles dataset¶
In [71]:
def merge_lists(column):
try:
return column.apply(lambda x: "|".join(x) if isinstance(x, list) else x)
except AttributeError:
return column
def add_download_link(row):
if row["issue_count"] > 0:
title = slugify(row["title"])[:50]
url = f"https://trove-journals.s3.ap-southeast-2.amazonaws.com/{title}-{row['title_id']}.zip"
response = s.head(url)
size = int(response.headers["Content-Length"])
return (
f'{{"href": "{url}", "label": "Download text ({naturalsize(size)} zip)"}}'
)
return ""
def save_titles(
df_issues, input_file="titles-enriched.ndjson", output_file="periodical-titles.csv"
):
df = pd.read_json(input_file, lines=True)
df = df.apply(merge_lists)
df = df.sort_values("issue_count").drop_duplicates(["id"], keep="last")
# Add thumbnail details from issues
df = pd.merge(
df,
df_issues.sort_values("date")
.groupby("title_id")
.head(1)[["title_id", "thumbnail"]],
how="left",
left_on="id",
right_on="title_id",
)
# Add a url that will search for articles in the periodical
df["search_url"] = df["id"].apply(
lambda x: f'{{"href": "https://trove.nla.gov.au/search/category/magazines?keyword=%22{x}%22", "label": "Search for articles in Trove"}}'
)
# Add a link to a zip file containing the OCRd text of this title
df["download_text"] = df.apply(add_download_link, axis=1)
# Clean up column names
df.rename(
columns={
"troveUrl": "trove_url",
"startDate": "start_date",
"endDate": "end_date",
},
inplace=True,
)
# Sort columns
df = df[
[
"thumbnail",
"id",
"title",
"description",
"publisher",
"trove_url",
"search_url",
"download_text",
"issue_count",
"start_date",
"end_date",
"start_year",
"end_year",
"extent",
"place",
"issn",
"catalogue_url",
]
]
# Make sure numbers are integers
df["issue_count"] = df["issue_count"].astype("Int64")
df["start_year"] = df["start_year"].astype("Int64")
df["end_year"] = df["end_year"].astype("Int64")
# Save data to CSV
df_csv = df.copy()
# Extract the url for text downloads
df_csv["download_text"] = df["download_text"].apply(
lambda x: json.loads(x)["href"] if x else ""
)
# Remove thumbnail and search_url and save as CSV
df_csv.drop(columns=["thumbnail", "search_url"]).sort_values("title").to_csv(
output_file, index=False
)
return df
In [72]:
df_titles = save_titles(df_issues)
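Again, an optional quick look (a sketch, assuming save_titles() has just run with its defaults) at the titles with the most digitised issues.
In [ ]:
# Optional check: which titles have the most digitised issues?
df_titles.sort_values("issue_count", ascending=False)[
    ["title", "issue_count", "start_year", "end_year"]
].head()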
Create an SQLite database¶
In [34]:
# Create a new SQLite database, replacing any existing file
db = Database("periodicals.db", recreate=True)
# Insert the titles and enable full-text search on the title and publisher columns
db["titles"].insert_all(df_titles.to_dict(orient="records"), pk="id")
db["titles"].enable_fts(["title", "publisher"])
# Drop the title column from issues -- the periodical title is available via the foreign key to titles
df_issues = df_issues.drop("title", axis=1)
db["issues"].insert_all(df_issues.to_dict(orient="records"), pk="id")
db["issues"].add_foreign_key("title_id", "titles", "id")
Out[34]:
<Table issues (thumbnail, id, title_id, description, date, url, pages, text_download_url)>
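As a quick check that the foreign key relationship behaves as expected, the cell below (a sketch run against the database just created) lists the five titles with the most issues.
In [ ]:
# Join issues to titles via the title_id foreign key and count issues per title.
for row in db.query(
    """
    SELECT titles.title, COUNT(issues.id) AS issue_total
    FROM titles
    LEFT JOIN issues ON issues.title_id = titles.id
    GROUP BY titles.id
    ORDER BY issue_total DESC
    LIMIT 5
    """
):
    print(row)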
In [10]:
# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
df_test = pd.read_json("titles-issues-added.ndjson", lines=True)
df_test.loc[(df_test["issn"].notnull()) & (df_test["new_issue_count"] < 20)][
:10
].to_json("test.ndjson", orient="records", lines=True)
enrich_periodicals_data(
input_file="test.ndjson",
output_titles="titles-test.ndjson",
output_issues="issues-test.ndjson",
)
df_issues = save_issues(
input_file="issues-test.ndjson", output_file="issues-test.csv"
)
df_titles = save_titles(
df_issues, input_file="titles-test.ndjson", output_file="titles-test.csv"
)
Path("test.ndjson").unlink()
Path("issues-test.ndjson").unlink()
Path("titles-test.ndjson").unlink()
Path("issues-test.csv").unlink()
Path("titles-test.csv").unlink()
Created by Tim Sherratt for the GLAM Workbench.