Harvest details of Commonwealth Parliamentary Papers digitised in Trove¶
Trove includes thousands of digitised papers and reports presented to the Commonwealth Parliament. The Trove Data Guide provides an overview of the Parliamentary Papers digitised in Trove.
However, finding all the Parliamentary Papers is not straightforward because of inconsistencies in the way they've been arranged and described. This notebook attempts to work around these problems and harvest data about the Parliamentary Papers in Trove that is as complete as possible.
The basic strategy is to harvest as many records as possible, and then merge any duplicates at the end. There are 4 main steps:
- search for digitised Parliamentary Papers using the /result API endpoint and work through all the grouped versions in each work record, saving all that are relevant – this will expand and separate wrongly-grouped records so they can be individually harvested
- enrich and expand the version records by extracting embedded metadata from the digitised item viewer – this gets the number of pages for publications, and extracts individual publication details from nested collections
- check for missing parent publications – some records, such as sections extracted from a Parliamentary Paper, will have parent publications; this step makes sure we've got a record for each parent
- merge duplicate and semi-duplicate records – de-duplicate based on unique fields (ie the link to the digitised item), and merge the values of other fields so that no metadata is lost
It should be noted that this method takes a long time and is very inefficient. The main reason for this is that a search for Parliamentary Papers returns more than 250,000 records. Most of these records are sections (or 'articles') extracted from Parliamentary Papers and delivered through the Magazines & Newsletters category. However, there doesn't seem to be a reliable way of distinguishing between these 'articles' and complete publications based on the API metadata alone. The 'articles' are identified in the code below using the embedded metadata extracted from the digitised file viewer, and excluded at the merge stage.
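In outline, the whole pipeline chains these steps together something like this (just a sketch – the functions are defined and explained in the cells below):
params = {
    "q": '"nla.obj" series:"Parliamentary paper (Australia. Parliament)"',
    "l-availability": "y",
}
harvest_works(params)  # step 1 – save relevant version records to pp-metadata.ndjson
enrich_records()  # step 2 – add embedded metadata, writing pp-metadata-pages.ndjson
get_missing_metadata()  # step 3 – append records for any missing parent publications
df = pd.read_json("pp-metadata-pages.ndjson", lines=True, convert_dates=False)
df_merged = merge_records(df)  # step 4 – de-duplicate and merge the results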
If you don't want to harvest all the metadata yourself, see Digitised Parliamentary Papers in Trove for a pre-harvested dataset.
Import what we need¶
import json
import os
import re
import time
from functools import reduce
from pathlib import Path
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
load_dotenv()
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
Save search results and extract versions¶
First we search for Parliamentary Papers using the basic query "nla.obj" series:"Parliamentary paper (Australia. Parliament)". Instead of just saving each work record, we check each version grouped within the work to see if it points to a digitised resource – if it does, we add it to the dataset.
def get_total_results(params, headers):
"""
Get the total number of results for a search.
"""
these_params = params.copy()
these_params["n"] = 0
response = s.get(
"https://api.trove.nla.gov.au/v3/result", params=these_params, headers=headers
)
data = response.json()
return int(data["category"][0]["records"]["total"])
def get_value(record, field, keys=["value"]):
"""
Get the values of a field.
Some fields are lists of dicts, if so use the `key` to get the value.
"""
value = record.get(field, [])
if value and isinstance(value[0], dict):
for key in keys:
try:
return [re.sub(r"\s+", " ", v[key]) for v in value]
except KeyError:
pass
else:
return value
def merge_values(record, fields, keys=["value"]):
"""
Merges values from multiple fields, removing any duplicates.
"""
values = []
for field in fields:
values += get_value(record, field, keys)
    # Remove duplicates and None values
return list(set([v for v in values if v is not None]))
def flatten_values(record, field, key="type"):
"""
If a field has a value and type, return the values as strings with this format: 'type: value'
"""
flattened = []
values = record.get(field, [])
for value in values:
if key in value:
flattened.append(f"{value[key]}: {value['value']}")
else:
flattened.append(value["value"])
return flattened
def flatten_identifiers(record):
"""
Get a list of control numbers from the identifier field and flatten the values.
"""
ids = {
"identifier": [
v
for v in record.get("identifier", [])
if "type" in v and v["type"] == "control number"
]
}
return flatten_values(ids, "identifier", "source")
def get_fulltext_url(links):
"""
    Loop through the identifiers to find a link to the full text version of the digitised item.
"""
urls = []
for link in links:
if (
"linktype" in link
and link["linktype"] == "fulltext"
and "nla.obj" in link["value"]
):
url = re.sub(r"^http\b", "https", link["value"])
link_text = link.get("linktext", "")
urls.append({"url": url, "link_text": link_text})
return urls
def get_catalogue_url(links):
"""
Loop through the identifiers to find a link to the NLA catalogue.
"""
for link in links:
if (
"linktype" in link
and link["linktype"] == "notonline"
and "nla.cat" in link["value"]
):
return link["value"]
return ""
def has_fulltext_link(links):
    """
    Check if a list of identifiers includes a fulltext url pointing to an NLA resource.
    """
    for link in links:
        if (
            "linktype" in link
            and link["linktype"] == "fulltext"
            and "nla.obj" in link["value"]
        ):
            return True
    return False
def get_digitised_versions(work):
"""
Get the versions from the given work that have a fulltext url pointing to an NLA resource
in the `identifier` field.
"""
versions = []
for version in work["version"]:
if "identifier" in version and has_fulltext_link(version["identifier"]):
versions.append(version)
return versions
def harvest_works(params, output="pp-metadata.ndjson", max=None):
"""
Harvest metadata relating to digitised works.
"""
harvested = 0
default_params = {
"category": "all",
"bulkHarvest": "true",
"n": 100,
"encoding": "json",
"include": ["links", "workversions"],
}
params.update(default_params)
headers = {"X-API-KEY": API_KEY}
total = max if max else get_total_results(params, headers)
start = "*"
with Path(output).open("w") as ndjson_file:
with tqdm(total=total) as pbar:
while start:
params["s"] = start
response = s.get(
"https://api.trove.nla.gov.au/v3/result",
params=params,
headers=headers,
)
data = response.json()
items = data["category"][0]["records"]["item"]
for item in items:
for category, record in item.items():
# See if there's a link to the full text version.
if category == "work" and "identifier" in record:
versions = get_digitised_versions(record)
for version in versions:
for sub_version in version["record"]:
metadata = sub_version["metadata"]["dc"]
# Sometimes fulltext identifiers are only available on the
# version rather than the sub version. So we'll look in the
# sub version first, and if they're not there use the url from
# the version.
# Sometimes there are multiple fulltext urls associated with a version:
# eg a collection page and a publication. If so add records for both urls.
# They could end up pointing to the same digitised publication, but
# we can sort that out later. Aim here is to try and not miss any possible
# routes to digitised publications!
urls = get_fulltext_url(metadata["identifier"])
if len(urls) == 0:
urls = get_fulltext_url(version["identifier"])
for url in urls:
work = {
# This is not the full set of available fields,
# adjust as necessary.
"title": get_value(metadata, "title"),
"work_url": record.get("troveUrl"),
"work_type": record.get("type", []),
"contributor": merge_values(
metadata,
["creator", "contributor"],
["value", "name"],
),
"publisher": get_value(
metadata, "publisher"
),
"date": merge_values(
metadata, ["date", "issued"]
),
# Using merge here because I've noticed some duplicate values
"type": merge_values(metadata, ["type"]),
"format": get_value(metadata, "format"),
"rights": merge_values(
metadata, ["rights", "licenseRef"]
),
"language": get_value(metadata, "language"),
"extent": get_value(metadata, "extent"),
"subject": merge_values(
metadata, ["subject"]
),
# Flattened type/value
"is_part_of": flatten_values(
metadata, "isPartOf"
),
# Only get control numbers and flatten
"identifier": flatten_identifiers(metadata),
"fulltext_url": url["url"],
"fulltext_url_text": url["link_text"],
"catalogue_url": get_catalogue_url(
metadata["identifier"]
),
# Could also add in data from bibliographicCitation
# Although the types used in citations seem to vary by work and format.
}
ndjson_file.write(f"{json.dumps(work)}\n")
# The nextStart parameter is used to get the next page of results.
# If there's no nextStart then it means we're on the last page of results.
harvested += len(items)
if max and harvested >= max:
start = None
else:
try:
start = data["category"][0]["records"]["nextStart"]
except KeyError:
start = None
pbar.update(len(items))
params = {
"q": '"nla.obj" series:"Parliamentary paper (Australia. Parliament)"',
"l-availability": "y",
}
harvest_works(params)
How many records have we harvested so far? Note that the number harvested is greater than the number of search results. This is because we've unpacked versions that had been grouped into works and saved them as separate records.
count = 0
with Path("pp-metadata.ndjson").open() as ndjson:
for line in ndjson:
count += 1
count
Enrich and expand records using metadata from the digitised file viewer¶
The digitised file viewer usually embeds some additional metadata, including the publication's MARC record from the NLA catalogue! If the file viewer link points to a publication, rather than a collection, the metadata will include details of individual pages. This code saves the number of pages in a publication and adds some extra metadata. If the file viewer link points to a collection, this code will unpack the individual publications from the collection and add them to the dataset.
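Once the functions below have been defined, you can peek at the embedded metadata for a single item using get_work_data(). The nla.obj identifier here is just a placeholder – substitute the id of any digitised item:
# Peek at the embedded metadata for one digitised item.
# "nla.obj-123456789" is a placeholder – replace it with a real identifier.
metadata = get_work_data("https://nla.gov.au/nla.obj-123456789")
print(metadata.get("pid"))  # the item's own identifier
print(metadata.get("form"))  # eg 'Book', 'Journal', or 'Multi Volume Book'
print(len(metadata.get("children", {}).get("page", [])))  # number of digitised pages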
def get_work_data(url):
"""
Extract work data in a JSON string from the work's HTML page.
"""
try:
response = s.get(url)
except requests.exceptions.InvalidURL:
response = s.get(url.replace("\\\\", "//"))
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
if not response.from_cache:
time.sleep(0.2)
return json.loads(work_data)
def get_pages(work):
"""
Get the number of pages from the work metadata.
"""
try:
pages = len(work["children"]["page"])
except KeyError:
pages = 0
return pages
def get_page_ids(work):
"""
Get a list of page identifiers from the work metadata.
"""
try:
page_ids = [p["pid"] for p in work["children"]["page"]]
except KeyError:
page_ids = []
return page_ids
def get_volumes(parent_id):
"""
Get the ids of volumes that are children of the current record.
"""
start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
# The initial startIdx value
start = 0
# Number of results per page
n = 20
parts = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
# Get the browse page
response = s.get(start_url.format(parent_id, start))
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "lxml")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
title = detail.find("h3")
if title:
issue_id = title.parent["href"].strip("/")
else:
issue_id = detail.find("a")["href"].strip("/")
# Get the issue id
parts.append(issue_id)
if not response.from_cache:
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return parts
def add_metadata(work, metadata, pages, include_page_ids=False):
"""
Add embedded metadata to existing record.
New values will be appended to existing list.
"""
fields = [
{"to": "title", "from": "title"},
{"to": "contributor", "from": "creator"},
{"to": "publisher", "from": "publisherName"},
{"to": "format", "from": "form"},
{"to": "rights", "from": "copyrightPolicy"},
{"to": "extent", "from": "extent"},
{"to": "identifier", "from": "holdingNumber"},
]
for field in fields:
value_from = metadata.get(field["from"])
if value_from:
try:
if value_from not in work[field["to"]]:
work[field["to"]].append(metadata.get(field["from"]))
except KeyError:
work[field["to"]] = [metadata.get(field["from"])]
except AttributeError:
if value_from != work[field["to"]]:
work[field["to"]] = [work[field["to"]], metadata.get(field["from"])]
work["alternative_title"] = " ".join(
[
metadata.get("subUnitType", ""),
metadata.get("subUnitNo", ""),
]
).strip()
if date := re.search(r"\b(\d{4})$", metadata.get("issueDate", "")):
work["date"] = date.group(1)
work["pages"] = pages
if include_page_ids:
work["page_ids"] = get_page_ids(metadata)
return work
def enrich_records(
input="pp-metadata.ndjson",
output="pp-metadata-pages.ndjson",
include_page_ids=False,
):
"""
Add the number of pages to the metadata for each work.
Add volumes from multi volume books.
"""
total = sum(1 for _ in open(input))
with Path(input).open("r") as ndjson_in:
with Path(output).open("w") as ndjson_out:
for line in tqdm(ndjson_in, total=total):
work = json.loads(line)
# print(book['fulltext_url'])
metadata = get_work_data(work["fulltext_url"])
# Some ids are for sections (articles) rather than the complete publications
# ignore these as we should already have the complete publication.
trove_id = re.search(r"(nla\.obj\-\d+)", work["fulltext_url"]).group(1)
if trove_id == metadata.get("pid"):
form = metadata.get("form")
pages = get_pages(metadata)
work = add_metadata(work.copy(), metadata, pages, include_page_ids)
parent = metadata.get("parent", {})
if ppid := parent.get("pid"):
work["parent"] = ppid
work["parent_url"] = "https://nla.gov.au/" + ppid
work["children"] = ""
                    # If there are no pages, it's probably a collection,
# so we have to get the ids of each individual publication in the collection and process them
if pages == 0 and form in ["Multi Volume Book", "Journal"]:
# Get child volumes
volumes = get_volumes(trove_id)
# For each volume get details and add as a new book entry
for volume_id in volumes:
volume = {
# Use values from parent
# If there are additional values in embedded metadata,
# they'll be added by add_metadata() below
"format": work["format"],
"subject": work["subject"],
"language": work["language"],
"is_part_of": work["is_part_of"],
"identifier": work["identifier"],
# Add link up to parent
"parent": trove_id,
"parent_url": work["work_url"],
# Because this is a collection child it has no work url.
# If there's an individual record for this publication
# it'll be separately harvested and merged later.
"work_url": "",
"fulltext_url": "https://nla.gov.au/{}".format(
volume_id
),
}
metadata = get_work_data(volume["fulltext_url"])
pages = get_pages(metadata)
volume = add_metadata(
volume, metadata, pages, include_page_ids
)
# print(volume)
ndjson_out.write(f"{json.dumps(volume)}\n")
# Add links from container to volumes
work["children"] = "|".join(volumes)
else:
work["parent"] = metadata.get("pid", "")
work["parent_url"] = "https://nla.gov.au/" + metadata.get("pid", "")
# print(book)
ndjson_out.write(f"{json.dumps(work)}\n")
enrich_records()
count = 0
with Path("pp-metadata-pages.ndjson").open() as ndjson:
for line in ndjson:
count += 1
count
Check for missing parent records¶
As noted, this method harvests sections extracted from Parliamentary Papers as well as full publications. The parent publication of a section should have been identified in the previous processing step. Here we make sure that we have individual publication records for all of the parents (yes, sometimes they're missing).
def get_missing_metadata(input="pp-metadata-pages.ndjson", include_page_ids=False):
    """
    Find parent ids that don't have their own records in the dataset,
    harvest their metadata, and append them as new records.
    """
df = pd.read_json(input, lines=True, convert_dates=False)
parent_ids = list(df["parent"].unique())
fulltext_urls = list(df["fulltext_url"].unique())
fulltext_ids = [f.split("/")[-1] for f in fulltext_urls]
# Find parent ids that we don't have as individual records
missing_ids = [m for m in list(set(parent_ids) - set(fulltext_ids)) if m != ""]
with Path(input).open("a") as ndjson_out:
for mid in tqdm(missing_ids):
fulltext_url = f"https://nla.gov.au/{mid}"
metadata = get_work_data(fulltext_url)
work = {
"fulltext_url": fulltext_url,
}
pages = get_pages(metadata)
work = add_metadata(work, metadata, pages, include_page_ids)
ndjson_out.write(f"{json.dumps(work)}\n")
get_missing_metadata()
count = 0
with Path("pp-metadata-pages.ndjson").open() as ndjson:
for line in ndjson:
count += 1
count
Merge duplicate records¶
Because of the way we've unpacked grouped versions, it's possible we might have created duplicate records. In any case, Trove itself includes near-duplicate records for many digitised resources – they often point to the same digitised resource, but with slightly different metadata. To make sure we get as many of the Parliamentary Papers as possible, we've left the duplicates in the dataset until now. In this step we exclude collections and 'articles' by leaving out records without pages, and then merge the rest. The merge process de-duplicates records based on fields that only have a single unique value: fulltext_url, pages, and alternative_title. Other fields can contain multiple values, so these are merged and separated by a | character.
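For example, two near-duplicate records might describe the same publication with slightly different titles. Once the next cell has been run, merge_column() combines values like these (made up for this example) into a single pipe-separated string:
# Toy illustration of how multi-value fields are merged
merge_column([["Annual report 1926"], ["Annual report 1926", "Report for the year 1926"]])
# Returns: 'Annual report 1926|Report for the year 1926'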
def merge_column(columns):
    """
    Combine the values from a group of records into a single string,
    removing duplicates and empty values and separating values with '|'.
    """
values = []
for value in columns:
if isinstance(value, list):
values += [str(v) for v in value if v]
elif value:
values.append(str(value))
return "|".join(sorted(set(values)))
def merge_records(df):
    """
    De-duplicate records on fields that only have a single unique value,
    and merge the values of all other fields.
    """
    df["pages"] = df["pages"].fillna(0)
    df = df.fillna("")
df["pages"] = df["pages"].astype("Int64")
# Add base dataset with columns that will always have only one value
# Only include records with pages (excludes sections of publications and collections)
dfs = [
df.loc[df["pages"] > 0][
["fulltext_url", "pages", "alternative_title"]
].drop_duplicates()
]
# Columns that potentially have multiple values which will be merged
columns = [
"title",
"work_url",
"work_type",
"contributor",
"publisher",
"date",
"type",
"format",
"extent",
"language",
"subject",
"is_part_of",
"identifier",
"rights",
"fulltext_url_text",
"catalogue_url",
"parent",
"parent_url",
"children",
]
# Merge values from each column in turn, creating a new dataframe from each
for column in columns:
dfs.append(
df.groupby(["fulltext_url"])[column].apply(merge_column).reset_index()
)
# Merge all the individual dataframes into one, linking on `fulltext_url` value
df_merged = reduce(
lambda left, right: pd.merge(left, right, on=["fulltext_url"], how="left"), dfs
)
return df_merged
df = pd.read_json("pp-metadata-pages.ndjson", lines=True, convert_dates=False)
df_merged = merge_records(df)
How many records are there now?
df_merged.shape[0]
Add a column that provides a link to download the OCRd text of the publication.
def add_download_link(row):
trove_id = re.search(r"(nla\.obj\-\d+)", row["fulltext_url"]).group(1)
last_page = row["pages"] - 1
return f"https://trove.nla.gov.au/{trove_id}/download?downloadOption=ocr&firstPage=0&lastPage={last_page}"
df_merged["text_download_url"] = df_merged.apply(add_download_link, axis=1)
Save the final dataset as CSV and Parquet files.
dataset_columns = [
"title",
"alternative_title",
"contributor",
"publisher",
"date",
"type",
"format",
"extent",
"language",
"subject",
"is_part_of",
"identifier",
"rights",
"pages",
"fulltext_url",
"fulltext_url_text",
"text_download_url",
"catalogue_url",
"work_url",
"work_type",
"parent",
"parent_url",
"children",
]
df_merged[dataset_columns].to_csv("trove-parliamentary-papers.csv", index=False)
df_merged[dataset_columns].to_parquet("trove-parliamentary-papers.parquet", index=False)
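If you want to work with the harvested dataset later without re-running the harvest, you can load it straight back into a dataframe, for example:
# Reload the saved dataset from the Parquet file
df_saved = pd.read_parquet("trove-parliamentary-papers.parquet")
df_saved.shape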
# TESTING ONLY -- PLEASE IGNORE THIS CELL
if os.getenv("GW_STATUS") == "dev":
params = {
"q": '"nla.obj" series:"Parliamentary paper (Australia. Parliament)"',
"l-availability": "y",
}
harvest_works(params, output="test.ndjson", max=100)
enrich_records(input="test.ndjson", output="test-pages.ndjson")
get_missing_metadata(input="test-pages.ndjson")
df = pd.read_json("test-pages.ndjson", lines=True, convert_dates=False)
df_merged = merge_records(df)
assert not df_merged.empty
Path("test.ndjson").unlink()
Path("test-pages.ndjson").unlink()
Created by Tim Sherratt for the GLAM Workbench.