Harvesting Australian Women's Weekly covers¶
(or all the front pages of any digitised newspaper)¶
Somewhat confusingly, the Australian Women's Weekly is included with Trove's digitised newspapers rather than with the rest of the magazines. The GLAM Workbench's journals section already has notebooks to help harvest all of a journal's covers as images, so I thought I should do the same for the Weekly.
Just change the TITLE_ID, START_YEAR, END_YEAR, and PREFIX to harvest all the front pages of any digitised newspaper.
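For example, to harvest the front pages of a different newspaper, your settings might look like this (the title id below is hypothetical; you can look up real title ids via the Trove API's newspaper titles endpoint):

TITLE_ID = "1234"  # hypothetical -- look up the real title id in Trove
START_YEAR = 1900
END_YEAR = 1911  # years up to, but not including, END_YEAR are harvested
PREFIX = "my-paper"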
Harvest summary¶
- The list of issues harvested is saved as a CSV (data/aww-issues.csv, created below).
- 2,566 images were downloaded.
- For easy browsing, I've compiled the images into a set of PDF files, one for each decade, available from Dropbox (a sketch of how you might build similar PDFs yourself follows below).
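The decade PDFs linked above were compiled separately, but if you want to build similar PDFs from your own harvest, here's a minimal sketch using Pillow (an assumption on my part; any imaging library that writes PDFs would do):

from pathlib import Path

from PIL import Image

# Gather one decade's covers (here the 1930s) from the default output directory
covers = sorted(Path("data", "aww").glob("aww-193*.jpg"))
# Convert to RGB and save the lot as a single multi-page PDF
pages = [Image.open(c).convert("RGB") for c in covers]
pages[0].save("aww-1930s.pdf", save_all=True, append_images=pages[1:])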
Import what we need¶
In [ ]:
import os
import re
import shutil
import time
from pathlib import Path
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from IPython.display import FileLink, display
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Create a session that caches responses locally, so nothing is requested twice
s = requests_cache.CachedSession("front_pages")

# Retry failed requests to ride out transient server errors
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Load environment variables from a .env file (if there is one)
load_dotenv()
Set some options¶
Modify the values below as required.
In [ ]:
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use the API key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
# The id of the newspaper you want to harvest
TITLE_ID = "112" # 112 is the AWW
# Range of years to harvest (from START_YEAR up to, but not including, END_YEAR)
START_YEAR = 1933
END_YEAR = 1983
# A prefix to use in file names; if None, the TITLE_ID will be used
PREFIX = "aww"
Define some functions¶
In [ ]:
TITLE_URL = f"https://api.trove.nla.gov.au/v3/newspaper/title/{TITLE_ID}"
def get_current_year(years, year):
"""
Get data for the current year from the dictionary of years.
"""
for year_data in years:
if year_data["date"] == str(year):
return year_data
def get_issues():
"""
Get all the issue details by looping through the range of years.
Returns a list of issues.
"""
params = {"encoding": "json", "include": "years"}
headers = {"X-API-KEY": API_KEY}
issues = []
for year in tqdm(range(START_YEAR, END_YEAR), desc="Issues"):
# Setting 'range' tells the API to give us a list of issue dates & urls within that date range
date_range = f"{year}0101-{year}1231"
params["range"] = date_range
# Get the data
response = s.get(TITLE_URL, params=params, headers=headers)
data = response.json()
        # Extract the details for the current year
        year_data = get_current_year(data["year"], year)
        # Save issue details (some years may have no issues at all)
        if year_data:
            for issue in year_data.get("issue", []):
                issues.append(issue)
return issues
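# Each issue record returned by get_issues() is a small dictionary.
# The values below are illustrative (not a real issue), but the code in
# this notebook relies on the "id", "date", and "url" fields:
# {
#     "id": "495496",
#     "date": "1933-06-10",
#     "url": "https://nla.gov.au/nla.news-issue495496"
# }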
def get_file_prefix():
"""
Set the prefix to be used in filenames and data directory.
Defaults to title id if prefix is not set
"""
if PREFIX:
file_prefix = PREFIX
else:
file_prefix = TITLE_ID
return file_prefix
def create_output_dir(file_prefix):
"""
Create output directory.
"""
dir_path = Path("data", file_prefix)
dir_path.mkdir(parents=True, exist_ok=True)
return dir_path
def download_page(page_id, size, file_path):
"""
Download page image using the supplied id.
Size range is 1 to 7 (7 being the highest res)
"""
    # Format the page url using the page id
page_url = (
f"http://trove.nla.gov.au/ndp/imageservice/nla.news-page{page_id}/level{size}"
)
# Download the image
response = s.get(page_url)
file_path.write_bytes(response.content)
    # Pause briefly between downloads to be polite to the image server
    time.sleep(0.5)
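# For example, to download a single page image at the highest resolution
# (the page id here is hypothetical, for illustration only):
# download_page("12345678", 7, Path("data", "sample-page.jpg"))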
def harvest_covers(size=5, sample_size=None):
"""
Get a list of issues of the title.
Loop through the issues downloading each front page/cover.
Return issue metadata.
"""
    # Get a list of issues
    issues = get_issues()
    # Set up the output directory and filename prefix once, before the loop
    file_prefix = get_file_prefix()
    dir_path = create_output_dir(file_prefix)
    # Loop through the issues
    for issue in tqdm(issues[:sample_size], desc="Pages"):
        # Request the issue url
        response = s.get(issue["url"])
        # The issue url will be redirected to a page url,
        # so extract the page id from the redirected url
        page_id = re.search(r"(\d+)$", response.url).group(1)
        # Save page id to metadata
        issue["page_id"] = page_id
        # Build the image file path from the prefix, issue date, and page id
        file_path = Path(
            dir_path,
            f'{file_prefix}-{issue["date"].replace("-", "")}-page{page_id}.jpg',
        )
# If the image hasn't already been downloaded, then download it!
if not file_path.exists():
download_page(page_id, size, file_path)
# Save the image name to the metadata
issue["image_name"] = file_path.name
return issues
Run the harvest!¶
In [ ]:
issues = harvest_covers()
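By default, harvest_covers() downloads every cover at resolution level 5. While you're testing, you can limit the number of covers and change the image size, for example:

issues = harvest_covers(size=7, sample_size=10)  # first ten covers, highest resolution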
Save the metadata¶
In [ ]:
# Convert the issue metadata into a DataFrame
df = pd.DataFrame(issues)
# Rename the id column so it's clear these are issue ids
df = df.rename(columns={"id": "issue_id"})
df.head()
In [ ]:
file_prefix = get_file_prefix()
df.to_csv(f"data/{file_prefix}-issues.csv", index=False)
display(FileLink(f"data/{file_prefix}-issues.csv"))
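If you come back to the harvest later, you can reload the metadata from the CSV; a minimal example, assuming the default aww prefix:

import pandas as pd

# Reload the harvested metadata and count the issues harvested per year
df = pd.read_csv("data/aww-issues.csv", parse_dates=["date"])
df["date"].dt.year.value_counts().sort_index()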
In [ ]:
# FOR TESTING ONLY -- IGNORE THIS CELL
if os.getenv("GW_STATUS") == "dev":
PREFIX = "test"
issues = harvest_covers(sample_size=5)
assert len(list(Path("data", "test").glob("*.jpg"))) == 5
shutil.rmtree(Path("data", "test"))
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.