Harvest the issues of a newspaper as PDFs
This notebook harvests issues of a newspaper as PDFs – one PDF per issue. If the newspaper has a long print run, this can consume a lot of time and disk space, so you might want to limit your harvest by date range.
The downloaded PDFs are saved in the data/issues folder. The PDF file names have the following structure:
[newspaper identifier]-[issue date as YYYYMMDD]-[issue identifier].pdf
For example:
903-19320528-1791051.pdf
- 903 – the newspaper identifier, in this case the Glen Innes Examiner
- 19320528 – the issue date, 28 May 1932
- 1791051 – the issue identifier; to view the issue in Trove, just add this to http://nla.gov.au/nla.news-issue, eg http://nla.gov.au/nla.news-issue1791051
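Because the file names follow this pattern, you can recover the basic metadata of a downloaded issue later without going back to the API. Here's a minimal sketch, using the example file name from above:
# Split an issue file name back into its parts
filename = "903-19320528-1791051.pdf"
title_id, issue_date, issue_id = filename[:-4].split("-")
print(title_id, issue_date, issue_id)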
Set up what we need
Make sure you paste in your Trove API key where indicated.
import json
import os
import time
from pathlib import Path
import arrow
import pandas as pd
import requests
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
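# Create a requests session that automatically retries requests that fail with server errors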
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
load_dotenv()
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
API_URL = "https://api.trove.nla.gov.au/v3/newspaper/title/"
PARAMS = {"encoding": "json"}
HEADERS = {"X-API-KEY": API_KEY}
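Before going any further, you can check that your API key works by requesting the details of an example title. This is just a quick sanity check – a 200 status code means your key was accepted. (Newspaper id 903 is the Glen Innes Examiner mentioned above.)
# Make a test request to check the API key is accepted
response = s.get(f"{API_URL}903", params=PARAMS, headers=HEADERS)
print(response.status_code)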
Get information about available issues
Before we start downloading huge numbers of PDFs, let's have a look at how many issues are available for the newspaper we're interested in. This code comes from harvest_newspaper_issues.ipynb.
# THIS CODE COMES FROM harvest_newspaper_issues.ipynb
# These are newspapers where the date ranges are off by more than a year
# In these cases we'll harvest all the issues in one hit, rather than year by year
dodgy_dates = ["1486", "1618", "586"]
def get_title_summary(title_id):
"""
Get the details of a single newspaper title.
"""
response = s.get(f"{API_URL}{title_id}", params=PARAMS, headers=HEADERS)
data = response.json()
return data
def get_issues_in_range(title_id, start_date, end_date):
"""
Get a list of issues available from a particular newspaper within the given date range.
"""
issues = []
params = PARAMS.copy()
params["include"] = "years"
params["range"] = f'{start_date.format("YYYYMMDD")}-{end_date.format("YYYYMMDD")}'
response = s.get(f"{API_URL}{title_id}", params=params, headers=HEADERS)
try:
data = response.json()
except json.JSONDecodeError:
print(response.url)
print(response.text)
else:
for year in data["year"]:
if "issue" in year:
for issue in year["issue"]:
issues.append(
{
"title_id": title_id,
"issue_id": issue["id"],
"issue_date": issue["date"],
}
)
return issues
def get_issues_full_range(title_id):
"""
In most cases we set date ranges to get issue data in friendly chunks. But sometimes the date ranges are missing or wrong.
In these cases, we ask for everything at once, by setting the range to the limits of Trove.
"""
start_date = arrow.get("1803-01-01")
range_end = arrow.now()
issues = get_issues_in_range(title_id, start_date, range_end)
return issues
def get_issues_from_title(title_id):
"""
Get a list of all the issues available for a particular newspaper.
Params:
* title_id - a newspaper identifier
Returns:
* A list containing details of available issues
"""
issues = []
title_summary = get_title_summary(title_id)
    # Date range is off by more than a year, so get everything in one hit
    # (convert the id to a string so integer ids match the list of dodgy titles)
    if str(title_id) in dodgy_dates:
issues += get_issues_full_range(title_id)
else:
try:
# The date ranges are not always reliable, so to make sure we get everything
# we'll set the range to the beginning and end of the given year
start_date = arrow.get(title_summary["startDate"]).replace(day=1, month=1)
end_date = arrow.get(title_summary["endDate"]).replace(day=31, month=12)
except KeyError:
# Some records have no start and end dates at all
# In this case set the range to the full range of Trove's newspapers
issues += get_issues_full_range(title_id)
else:
# If the date range is available, loop through it by year
while start_date <= end_date:
range_end = start_date.replace(month=12, day=31)
issues += get_issues_in_range(title_id, start_date, range_end)
start_date = start_date.shift(years=+1).replace(month=1, day=1)
return issues
Harvest the issue data.
# Set the id of the newspaper you want to harvest from
# You can get the newspaper id from the title details page in Trove
trove_newspaper_id = 1646
# Harvest the issue data
issues = get_issues_from_title(trove_newspaper_id)
Convert to a dataframe for analysis.
df = pd.DataFrame(issues)
df.head()
How many issues are available?
df.shape[0]
What is the date range of the issues?
df["issue_date"].min()
df["issue_date"].max()
Harvest the issues as PDFs
Now that we have the issue data, we can use it to download the PDFs.
# THIS CODE IS A SLIGHTLY MODIFIED VERSION OF WHAT'S IN THE TROVE NEWSPAPER HARVESTER
def ping_pdf(ping_url):
"""
Check to see if a PDF is ready for download.
If a 200 status code is received, return True.
"""
ready = False
try:
response = s.get(ping_url, timeout=30)
response.raise_for_status()
except HTTPError:
if response.status_code == 423:
ready = False
else:
raise
else:
ready = True
return ready
def get_pdf_url(issue_id):
"""
    Get the url of the PDF version of an issue.
These can take a while to generate, so we need to ping the server to see if it's ready before we download.
"""
pdf_url = None
# Ask for the PDF to be created
prep_url = (
f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}/prep"
)
response = s.get(prep_url)
# Get the hash
prep_id = response.text
# Url to check if the PDF is ready
ping_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.ping?followup={prep_id}"
tries = 0
ready = False
time.sleep(2) # Give some time to generate pdf
# Are you ready yet?
while ready is False and tries < 5:
ready = ping_pdf(ping_url)
if not ready:
tries += 1
time.sleep(2)
# Download if ready
if ready:
pdf_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.pdf?followup={prep_id}"
return pdf_url
def harvest_pdfs(issues, start_date=None, end_date=None):
"""
Download all issue pdfs within the given date range.
"""
output_path = Path("data", "issues")
output_path.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(issues)
if start_date and end_date:
df_range = df.loc[
(df["issue_date"] >= start_date) & (df["issue_date"] <= end_date)
]
elif start_date:
df_range = df.loc[(df["issue_date"] >= start_date)]
    elif end_date:
        df_range = df.loc[(df["issue_date"] <= end_date)]
else:
df_range = df
    for issue in tqdm(df_range.itertuples(), total=df_range.shape[0]):
        pdf_url = get_pdf_url(issue.issue_id)
        # Skip this issue if the PDF never became ready for download
        if pdf_url:
            response = s.get(pdf_url)
            Path(
                output_path,
                f'{issue.title_id}-{issue.issue_date.replace("-", "")}-{issue.issue_id}.pdf',
            ).write_bytes(response.content)
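If you only want a single issue, rather than a date range, you can call get_pdf_url directly. A quick sketch, using the example issue identifier from the introduction:
# Download the PDF of a single issue
Path("data", "issues").mkdir(parents=True, exist_ok=True)
pdf_url = get_pdf_url("1791051")
if pdf_url:
    pdf_path = Path("data", "issues", "903-19320528-1791051.pdf")
    pdf_path.write_bytes(s.get(pdf_url).content)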
In the cell below you can set a date range for your harvest. Adjust the start and end dates as required. If you want to harvest ALL the issues, set the start and end dates to None.
# Set start and end dates - YYYY-MM-DD, eg:
# start_date = '1932-05-01'
# Adjust these to suit your case, set to None to get everything
start_date = None
end_date = None
# Start harvesting the PDFs!
harvest_pdfs(issues, start_date=start_date, end_date=end_date)
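Once the harvest has finished, you can check what was saved. A quick sketch that counts the downloaded files and reports their combined size:
# Summarise the harvested PDFs
pdfs = list(Path("data", "issues").glob("*.pdf"))
total_mb = sum(pdf.stat().st_size for pdf in pdfs) / (1024 * 1024)
print(f"{len(pdfs)} PDFs, {total_mb:.1f} MB in total")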
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.