Harvest details of periodicals submitted to Trove through the National edeposit scheme (NED)¶
This notebook harvests details of periodicals submitted to Trove through the National edeposit scheme (NED). It creates two datasets, one containing details of the periodical titles, and the other listing all the available issues.
There are two main harvesting steps. The first is to search for periodicals using the API's /result endpoint with the following parameters:
- `q` set to `"nla.obj" nuc:"ANL:NED"`
- the `format` facet set to `Periodical`
- `l-availability` set to `y`
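For example, a stripped-down version of this search (using a placeholder API key, and without the paging and extra include values added by the harvesting function below) looks something like this:
# A minimal sketch of the search described above -- placeholder API key only
import requests

MY_API_KEY = "YOUR API KEY"  # substitute your own Trove API key
params = {
    "q": '"nla.obj" nuc:"ANL:NED"',
    "category": "all",
    "l-format": "Periodical",
    "l-availability": "y",
    "encoding": "json",
    "n": 1,
}
response = requests.get(
    "https://api.trove.nla.gov.au/v3/result",
    params=params,
    headers={"X-API-KEY": MY_API_KEY},
)
print(response.json()["category"][0]["records"]["total"])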
The work records returned by this search are unpacked, and individual versions saved to make sure we get everything. Once this is complete, any duplicate records are merged.
The second step harvests details of issues by extracting a list of issues for each title from the collection viewer. It then supplements the issue metadata by extracting information for each issue from the journal viewer.
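To give a sense of how the second step works, the sketch below requests the first browse page for one of the harvested titles (nla.obj-1916881555, which appears in the results further down) and counts the issue entries it lists. The URL pattern and CSS class mirror the get_issues() function defined later in this notebook.
# Illustrative only: fetch one page of the collection browse listing for a
# single NED title and count the issue entries it contains.
import requests
from bs4 import BeautifulSoup

response = requests.get(
    "https://nla.gov.au/nla.obj-1916881555/browse?startIdx=0&rows=20&op=c"
)
soup = BeautifulSoup(response.text, "lxml")
# Each issue on the browse page sits in a div with the class 'l-item-info'
print(len(soup.find_all(class_="l-item-info")))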
# Let's import the libraries we need.
import json
import os
import re
import time
from datetime import timedelta
from functools import reduce
from pathlib import Path
import arrow
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sqlite_utils import Database
from tqdm.auto import tqdm
r = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
r.mount("https://", HTTPAdapter(max_retries=retries))
r.mount("http://", HTTPAdapter(max_retries=retries))
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
Add your Trove API key¶
You can get a Trove API key by following these instructions.
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
Define some functions to do the work¶
def get_total_results(params, headers):
"""
Get the total number of results for a search.
"""
these_params = params.copy()
these_params["n"] = 0
response = s.get(
"https://api.trove.nla.gov.au/v3/result", params=these_params, headers=headers
)
data = response.json()
return int(data["category"][0]["records"]["total"])
def get_value(record, field, keys=["value"]):
    """
    Get the values of a field.
    Some fields are lists of dicts, if so use the `keys` to get the value.
    """
    value = record.get(field, [])
    if value and isinstance(value[0], dict):
        for key in keys:
            try:
                return [re.sub(r"\s+", " ", v[key]) for v in value]
            except KeyError:
                pass
        # None of the supplied keys matched, so return an empty list
        # rather than None (merge_values expects a list)
        return []
    return value
def merge_values(record, fields, keys=["value"]):
"""
Merges values from multiple fields, removing any duplicates.
"""
values = []
for field in fields:
values += get_value(record, field, keys)
# Remove duplicates and None value
return list(set([v for v in values if v is not None]))
def flatten_values(record, field, key="type"):
"""
If a field has a value and type, return the values as strings with this format: 'type: value'
"""
flattened = []
values = record.get(field, [])
for value in values:
if key in value:
flattened.append(f"{value[key]}: {value['value']}")
else:
flattened.append(value["value"])
return flattened
def flatten_identifiers(record):
"""
Get a list of control numbers from the identifier field and flatten the values.
"""
ids = {
"identifier": [
v
for v in record.get("identifier", [])
if "type" in v and v["type"] == "control number"
]
}
return flatten_values(ids, "identifier", "source")
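To see how these field helpers behave, here's a quick demonstration using a made-up record (the values below are illustrative only, not a real Trove record).
# A made-up record to illustrate the helpers above
sample = {
    "title": [{"value": "Example  newsletter"}],
    "creator": [{"value": "Jane Citizen"}],
    "contributor": [{"name": "Example Historical Society"}],
    "isPartOf": [{"type": "series", "value": "Annual reports"}],
}
print(get_value(sample, "title"))  # ['Example newsletter'] -- whitespace normalised
print(merge_values(sample, ["creator", "contributor"], ["value", "name"]))  # both names (order may vary)
print(flatten_values(sample, "isPartOf"))  # ['series: Annual reports']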
def get_fulltext_url(links):
"""
    Loop through the identifiers to find a link to the full text version of the periodical.
"""
urls = []
for link in links:
if (
"linktype" in link
and link["linktype"] == "fulltext"
and "nla.obj" in link["value"]
and "edeposit" in link.get("linktext", "")
):
url = re.sub(r"^http\b", "https", link["value"])
url = re.sub(r"^https://www\.", "https://", url)
link_text = link.get("linktext", "")
urls.append({"url": url, "link_text": link_text})
return urls
def get_catalogue_url(links):
"""
Loop through the identifiers to find a link to the NLA catalogue.
"""
for link in links:
if (
"linktype" in link
and link["linktype"] == "notonline"
and "nla.cat" in link["value"]
):
return link["value"]
return ""
def has_fulltext_link(links):
"""
Check if a list of identifiers includes a fulltext url pointing to an NLA resource.
"""
for link in links:
if (
"linktype" in link
and link["linktype"] == "fulltext"
and "nla.obj" in link["value"]
and "edeposit" in link.get("linktext", "")
):
return True
def has_holding(holdings, nucs):
"""
Check if a list of holdings includes one of the supplied nucs.
"""
for holding in holdings:
if holding.get("nuc") in nucs:
return True
def get_digitised_versions(work):
"""
Get the versions from the given work that have a fulltext url pointing to an NLA resource
in the `identifier` field.
"""
versions = []
for version in work["version"]:
if "identifier" in version and has_fulltext_link(version["identifier"]):
versions.append(version)
return versions
def get_nuc_versions(work, nucs=["ANL", "ANL:DL"]):
    """
    Get the versions from the given work that are held by the NLA.
    """
    versions = []
    for version in work["version"]:
        # Check holdings against the supplied list of NUCs (rather than a hard-coded list)
        if "holding" in version and has_holding(version["holding"], nucs):
            versions.append(version)
    return versions
def harvest_works(
params,
filter_by="url",
nucs=["ANL", "ANL:DL"],
output_file="harvested-metadata.ndjson",
):
"""
Harvest metadata relating to digitised works.
The filter_by parameter selects records for inclusion in the dataset, options:
* url -- only include versions that have an NLA fulltext url
* nuc -- only include versions that have an NLA nuc (ANL or ANL:DL)
"""
default_params = {
"category": "all",
"bulkHarvest": "true",
"n": 100,
"encoding": "json",
"include": ["links", "workversions", "holdings"],
}
params.update(default_params)
headers = {"X-API-KEY": API_KEY}
total = get_total_results(params, headers)
start = "*"
with Path(output_file).open("w") as ndjson_file:
with tqdm(total=total) as pbar:
while start:
params["s"] = start
response = r.get(
"https://api.trove.nla.gov.au/v3/result",
params=params,
headers=headers,
)
data = response.json()
items = data["category"][0]["records"]["item"]
for item in items:
for category, record in item.items():
if category == "work":
if filter_by == "nuc":
versions = get_nuc_versions(record, nucs)
else:
versions = get_digitised_versions(record)
# Sometimes there are fulltext links on work but not versions
if len(versions) == 0 and has_fulltext_link(
record["identifier"]
):
versions = record["version"]
for version in versions:
for sub_version in version["record"]:
metadata = sub_version["metadata"]["dc"]
# Sometimes fulltext identifiers are only available on the
# version rather than the sub version. So we'll look in the
# sub version first, and if they're not there use the url from
# the version.
# Sometimes there are multiple fulltext urls associated with a version:
# eg a collection page and a publication. If so add records for both urls.
# They could end up pointing to the same digitised publication, but
# we can sort that out later. Aim here is to try and not miss any possible
# routes to digitised publications!
urls = get_fulltext_url(
metadata.get("identifier", [])
)
if len(urls) == 0:
urls = get_fulltext_url(
version.get("identifier", [])
)
# Sometimes there are fulltext links on work but not versions
if len(urls) == 0:
urls = get_fulltext_url(
record.get("identifier", [])
)
if len(urls) == 0 and filter_by == "nuc":
urls = [{"url": "", "link_text": ""}]
for url in urls:
work = {
# This is not the full set of available fields,
# adjust as necessary.
"title": get_value(metadata, "title"),
"work_url": record.get("troveUrl"),
"work_type": record.get("type", []),
"contributor": merge_values(
metadata,
["creator", "contributor"],
["value", "name"],
),
"publisher": get_value(
metadata, "publisher"
),
"date": merge_values(
metadata, ["date", "issued"]
),
# Using merge here because I've noticed some duplicate values
"type": merge_values(metadata, ["type"]),
"format": get_value(metadata, "format"),
"rights": merge_values(
metadata, ["rights", "licenseRef"]
),
"language": get_value(metadata, "language"),
"extent": get_value(metadata, "extent"),
"subject": merge_values(
metadata, ["subject"]
),
"spatial": get_value(metadata, "spatial"),
# Flattened type/value
"is_part_of": flatten_values(
metadata, "isPartOf"
),
# Only get control numbers and flatten
"identifier": flatten_identifiers(metadata),
"fulltext_url": url["url"],
"fulltext_url_text": url["link_text"],
"catalogue_url": get_catalogue_url(
metadata["identifier"]
),
# Could also add in data from bibliographicCitation
# Although the types used in citations seem to vary by work and format.
}
ndjson_file.write(f"{json.dumps(work)}\n")
# The nextStart parameter is used to get the next page of results.
# If there's no nextStart then it means we're on the last page of results.
try:
start = data["category"][0]["records"]["nextStart"]
except KeyError:
start = None
pbar.update(len(items))
Harvest periodical titles¶
The first step is to search for NED periodical titles and harvest all the version records.
params = {
"q": '"nla.obj" nuc:"ANL:NED"',
"l-format": "Periodical", # Journals only
"l-availability": "y",
}
harvest_works(params, output_file="ned-periodicals.ndjson")
Remove duplicates¶
Because we've unpacked the work records and saved individual versions, there are likely to be some duplicates. Here we'll merge the duplicate records.
def merge_column(columns):
values = []
for value in columns:
if isinstance(value, list):
values += [str(v) for v in value if v]
elif value:
values.append(str(value))
return " | ".join(sorted(set(values)))
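For example (with made-up values), merge_column() flattens a mix of lists and single values into one pipe-separated string, dropping duplicates and empty values.
# Illustrative only -- made-up values
merge_column([["Journal", "Periodical"], "Journal", None, ""])
# returns 'Journal | Periodical'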
def merge_records(df):
# df["pages"].fillna(0, inplace=True)
# df.fillna("", inplace=True)
# df["pages"] = df["pages"].astype("Int64")
# Add base dataset with columns that will always have only one value
dfs = [df[["fulltext_url"]].drop_duplicates()]
# Columns that potentially have multiple values which will be merged
columns = [
"title",
"work_url",
"work_type",
"contributor",
"publisher",
"date",
"type",
"format",
"extent",
"language",
"subject",
"spatial",
"is_part_of",
"identifier",
"rights",
"fulltext_url_text",
"catalogue_url",
]
# Merge values from each column in turn, creating a new dataframe from each
for column in columns:
dfs.append(
df.groupby(["fulltext_url"])[column].apply(merge_column).reset_index()
)
    # Merge all the individual dataframes into one, linking on the `fulltext_url` value
df_merged = reduce(
lambda left, right: pd.merge(left, right, on=["fulltext_url"], how="left"), dfs
)
return df_merged
Load the harvested data.
df = pd.read_json("ned-periodicals.ndjson", lines=True)
How many records are there?
df.shape
(8849, 18)
Now we'll merge the duplicates.
df_merged = merge_records(df)
How many records are there now?
# How many journals are there?
df_merged.shape[0]
7973
Do some reorganisation of the dataset and save it as a CSV file.
def save_ned_titles(df, output="ned-periodicals.csv"):
df["id"] = df["fulltext_url"].apply(lambda x: x.strip("/").split("/")[-1])
df_titles = df[
[
"id",
"title",
"contributor",
"publisher",
"date",
"fulltext_url",
"work_url",
"work_type",
"type",
"format",
"extent",
"language",
"subject",
"spatial",
"is_part_of",
"identifier",
"rights",
"catalogue_url",
]
]
df_titles.to_csv(output, index=False)
return df_titles
df_titles = save_ned_titles(df_merged)
Get details of issues¶
def get_metadata(id):
"""
Extract work data in a JSON string from the work's HTML page.
"""
if not id.startswith("http"):
id = "https://nla.gov.au/" + id
response = s.get(id)
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
if not response.from_cache:
time.sleep(0.2)
return json.loads(work_data)
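The journal viewer pages embed the item's metadata in a JavaScript variable, and the regular expression above captures the JSON object from that line. Here's a simplified, made-up example of the sort of string it matches (not copied from a real page).
# Made-up example of the embedded JavaScript that get_metadata() parses
sample_html = 'var work = JSON.parse(JSON.stringify({"pid": "nla.obj-123456789", "title": "Example newsletter"}));'
match = re.search(r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", sample_html)
json.loads(match.group(1))
# {'pid': 'nla.obj-123456789', 'title': 'Example newsletter'}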
def get_iso_date(date):
    """
    Extract the year from a date string formatted like 'Fri, 1 Jan 2021'.
    """
    if date:
        iso_date = arrow.get(date, "ddd, D MMM YYYY").format("YYYY")
    else:
        iso_date = None
    return iso_date
def get_issues(parent_id):
"""
Get the ids of issues that are children of the current record.
"""
start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
# The initial startIdx value
start = 0
# Number of results per page
n = 20
parts = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
# Get the browse page
response = s.get(start_url.format(parent_id, start))
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "lxml")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
title = detail.find("h3")
if title:
issue_id = title.parent["href"].strip("/")
else:
issue_id = detail.find("a")["href"].strip("/")
# Get the issue id
parts.append(issue_id)
if not response.from_cache:
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return parts
def harvest_all_issues(input="ned-periodicals.csv", output="ned-issues.ndjson"):
df = pd.read_csv(input)
with Path(output).open("w") as ndjson_file:
for title in tqdm(df.itertuples(), total=df.shape[0]):
title_id = title.fulltext_url.strip("/").split("/")[-1]
issues = get_issues(title_id)
for issue_id in issues:
metadata = get_metadata(issue_id)
try:
issue = {
"id": metadata["pid"],
"title_id": title_id,
"title": metadata["title"],
"description": metadata.get("subUnitNo", ""),
"date": get_iso_date(metadata.get("issueDate", None)),
"url": f"https://nla.gov.au/{metadata['pid']}",
"ebook_type": metadata.get("ebookType", ""),
"access_conditions": metadata.get("accessConditions", ""),
"copyright_policy": metadata.get("copyrightPolicy", ""),
}
except KeyError:
print(title_id)
else:
ndjson_file.write(f"{json.dumps(issue)}\n")
harvest_all_issues()
nla.obj-831175906 (repeated for each issue of this title that lacked the expected metadata)
df_issues = pd.read_json(
"ned-issues.ndjson", convert_dates=False, dtype={"date": "Int64"}, lines=True
)
def add_download_link(row):
    """
    Add a direct ebook download url for issues whose access conditions are 'Unrestricted'.
    """
    url = ""
    if row["access_conditions"] == "Unrestricted":
        url = f"https://nla.gov.au/{row['id']}/download?downloadOption=eBook&firstPage=-1&lastPage=-1"
    return url
df_issues["download_link"] = df_issues.apply(add_download_link, axis=1)
df_issues.to_csv("ned-periodical-issues.csv", index=False)
df_totals = (
df_issues.loc[df_issues["access_conditions"] == "Unrestricted"]
.groupby(["title_id", "title"])
.size()
.to_frame()
.reset_index()
)
df_totals.sort_values(0, ascending=False)[:20]
 | title_id | title | 0 |
---|---|---|---|
1758 | nla.obj-1916881555 | Western Australian government gazette. | 1869 |
4436 | nla.obj-2940864261 | The Australian Jewish News. | 1067 |
2619 | nla.obj-2692666983 | APSjobs-vacancies daily ... daily gazette. | 1043 |
4461 | nla.obj-2945379691 | Tweed link | 825 |
2221 | nla.obj-2541626239 | Weekly notice | 798 |
4435 | nla.obj-2940863963 | The Australian Jewish News. | 726 |
35 | nla.obj-1252109725 | Queensland Health services bulletin | 700 |
17 | nla.obj-1247944368 | Hyden Karlgarin Householder News. | 642 |
769 | nla.obj-1775015332 | E-record : your news from across the Archdioce... | 640 |
7194 | nla.obj-638303044 | Class ruling | 580 |
2211 | nla.obj-2536144595 | Plantagenet news. | 574 |
185 | nla.obj-1252305285 | Clermont rag : Community newspaper. | 514 |
3402 | nla.obj-2815835489 | The Apollo Bay news. | 513 |
1731 | nla.obj-1908935587 | Assessment reports and exam papers | 512 |
5656 | nla.obj-3125539859 | The Peninsula community access news. | 506 |
3956 | nla.obj-2859788676 | Council news : weekly information from us to you | 469 |
43 | nla.obj-1252119874 | Rot-Ayr-Ian [electronic resource] : the offici... | 467 |
4899 | nla.obj-2994765231 | Townsville Orchid Society Inc. bulletin. | 442 |
141 | nla.obj-1252246096 | Palm Island Voice. | 428 |
6264 | nla.obj-3267060622 | News & views from George Cochrane. | 399 |
df_issues["access_conditions"].value_counts()
access_conditions
Unrestricted    138557
View Only        12937
Onsite Only       4657
Name: count, dtype: int64
df_issues["ebook_type"].value_counts()
ebook_type
application/pdf         154976
                          1075
application/epub+zip       100
Name: count, dtype: int64
db = Database("ned-periodicals.db", recreate=True)
df_titles.insert(
0,
"thumbnail",
df_titles["fulltext_url"].apply(
lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
),
)
db["titles"].insert_all(df_titles.to_dict(orient="records"), pk="id")
db["titles"].enable_fts(["title", "contributor", "publisher", "subject"])
df_issues.insert(
0,
"thumbnail",
df_issues["url"].apply(
lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
),
)
df_issues = df_issues.drop("title", axis=1)
db["issues"].insert_all(df_issues.to_dict(orient="records"), pk="id")
db["issues"].add_foreign_key("title_id", "titles", "id")
<Table issues (thumbnail, id, title_id, description, date, url, ebook_type, access_conditions, copyright_policy, download_link)>
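Once the database has been built by the cells above, you can query it directly with sqlite-utils. This is just a minimal sketch; the SQL mirrors the issue counts calculated earlier.
# A minimal sketch of querying the packaged SQLite database.
# Re-open the database file (or just reuse the `db` object created above).
db = Database("ned-periodicals.db")
# Count unrestricted issues per title, joining via the title_id foreign key
for row in db.query(
    "SELECT t.title, COUNT(*) AS issues "
    "FROM issues i JOIN titles t ON t.id = i.title_id "
    "WHERE i.access_conditions = 'Unrestricted' "
    "GROUP BY t.id ORDER BY issues DESC LIMIT 5"
):
    print(row)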
# IGNORE THIS CELL -- FOR TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
df_test = pd.read_json("ned-periodicals.ndjson", lines=True)[:20]
df_merged_test = merge_records(df_test)
df_titles_test = save_ned_titles(df_merged_test, "ned-periodicals-test.csv")
harvest_all_issues(
input="ned-periodicals-test.csv", output="ned-periodicals-issues-test.ndjson"
)
Path("ned-periodicals-test.csv").unlink()
Path("ned-periodicals-issues-test.ndjson").unlink()
Created by Tim Sherratt for the GLAM Workbench.