Make composite images from lots of Trove newspaper thumbnails¶
This notebook starts with a search in Trove's newspapers. It uses the Trove API to work its way through the search results. For each article it creates a thumbnail image using the code from this notebook. Once this first stage is finished, you have a directory full of lots of thumbnails.
The next stage takes all those thumbnails and pastes them one by one into a BIG image to create a composite, or mosaic.
You'll need to think carefully about the number of results in your search, and the size of the image you want to create. Harvesting all the thumbnails can take a long time.
Also, you need to be able to set a path to a font file, so it's probably best to run this notebook on your local machine rather than in a cloud service, where you have more control over things like fonts. You might also need to adjust the font size depending on the font you choose.
Some examples:
import os
import re
from io import BytesIO
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from PIL import Image, ImageDraw, ImageFont
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Shared HTTP session used for the API and image requests.
# Requests that come back with a 502, 503 or 504 status are retried (up to 5 times).
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for prefix in ("https://", "http://"):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

# Make sure the directory that will hold the harvested thumbnails exists
os.makedirs("thumbs", exist_ok=True)

# Load environment variables (such as TROVE_API_KEY) from a .env file if present
load_dotenv()
Set your parameters¶
Edit the values below as required.
# Path to the TrueType font used to label each thumbnail.
# Change this to suit your system, eg on macOS: "/Library/Fonts/Courier New.ttf"
font_path = "/usr/share/fonts/truetype/freefont/FreeMono.ttf"
font_size = 12

# Insert your search query below
query = 'title:"white australia policy" date:[1960 TO 1969]'

size = 200  # Size of the thumbnails
cols = 90  # The width of the final image will be cols x size
rows = 55  # The height of the final image will be rows x size

# Insert your Trove API key
api_key = "YOUR API KEY"

# Use api key value from environment variables if it is available
api_key = os.getenv("TROVE_API_KEY") or api_key

# Header sent with every Trove API request
headers = {"X-API-KEY": api_key}
Define some functions¶
def get_article_top(article_url):
    """
    Find the position of the top of an article on its newspaper page.

    Positional information about the article is attached to each line of the OCR
    output in data attributes. This function loads the HTML version of the article
    and scrapes the x, y, and width values for the top line of text (ie the top of
    the article).

    Parameters:
        article_url: URL of the article's web page.

    Returns:
        A dict with integer "x", "y" and "w" values read from the top-most zone.

    Raises:
        IndexError: if the page contains no 'div.zone.onPage' elements.
        KeyError: if the top zone lacks a required data attribute.
    """
    # Use the shared session so retries apply, and set a timeout so that a
    # stalled connection can't hang the whole harvest.
    response = s.get(article_url, timeout=60)
    soup = BeautifulSoup(response.text, "lxml")
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select("div.zone.onPage")
    if not zones:
        raise IndexError("No OCR zones found in {}".format(article_url))
    # Illustrations might come after text even if they're above them on the page,
    # so pick the zone with the lowest 'y' attribute rather than just the first one.
    top_element = min(zones, key=lambda zone: int(zone["data-y"]))
    return {
        "x": int(top_element["data-x"]),
        "y": int(top_element["data-y"]),
        "w": int(top_element["data-w"]),
    }
def get_thumbnail(article, size, font_path, font_size):
    """
    Create a labelled, square thumbnail from the top of a newspaper article.

    The page image is downloaded, cropped to a square whose side is the width of
    the article's top line, shrunk to fit within size x size, and stamped along
    its bottom edge with the article identifier.

    Parameters:
        article: article record from the Trove API; the 'trovePageUrl', 'troveUrl'
            and 'id' values are used.
        size: maximum width and height of the thumbnail.
        font_path: path to the TrueType font used for the label.
        font_size: font size of the label, also used as the height of the label band.

    Returns:
        The thumbnail as a PIL image, or None if the article has no usable page
        url, its position on the page can't be found, or the page image can't be
        read.
    """
    buffer = 0
    try:
        page_id = re.search(r"news-page(\d+)", article["trovePageUrl"]).group(1)
    except (AttributeError, KeyError):
        # No page url, or no page id in it
        return None
    try:
        # Get position of top line of article
        article_top = get_article_top(article["troveUrl"])
    except (IndexError, KeyError, ValueError):
        # No positional data available, skip this article rather than stop the harvest
        return None
    # Construct the url we need to download the image
    page_url = (
        "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
            page_id, 7
        )
    )
    # Download the page image
    response = s.get(page_url, timeout=120)
    # Use coordinates of top line to create a square box to crop thumbnail
    box = (
        article_top["x"] - buffer,
        article_top["y"] - buffer,
        article_top["x"] + article_top["w"] + buffer,
        article_top["y"] + article_top["w"] + buffer,
    )
    try:
        # Open download as an image for editing and crop it to create thumb
        img = Image.open(BytesIO(response.content))
        thumb = img.crop(box)
    except OSError:
        # The download wasn't a readable image
        return None
    # Resize thumb (LANCZOS is the current name of the old ANTIALIAS filter)
    thumb.thumbnail((size, size), Image.LANCZOS)
    article_id = "nla.news-article{}".format(article["id"])
    fnt = ImageFont.truetype(font_path, font_size)
    draw = ImageDraw.Draw(thumb)
    # Position the label from the thumbnail's real dimensions, as thumbnail()
    # never enlarges an image so it can be smaller than size x size.
    label_top = thumb.height - font_size
    # Colour names work for both RGB and grayscale images
    draw.rectangle([(0, label_top), (thumb.width, thumb.height)], fill="white")
    draw.text((0, label_top), article_id, font=fnt, fill="black")
    return thumb
def get_total_results(params):
    """
    Get the total number of results for a search.

    Parameters:
        params: dict of Trove API search parameters. It is not modified.

    Returns:
        The total number of matching records as an int.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    # Only the count is needed, so ask for zero records (on a copy of the params)
    these_params = {**params, "n": 0}
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=these_params,
        headers=headers,
        timeout=60,
    )
    # Report HTTP errors (eg a rejected API key) directly, rather than as a
    # confusing KeyError when the expected fields are missing from the response.
    response.raise_for_status()
    data = response.json()
    return int(data["category"][0]["records"]["total"])
def get_thumbnails(query, size, font_path, font_size):
    """
    Harvest a thumbnail for every newspaper article matching a search.

    Works through the search results a page at a time and saves each thumbnail
    to the 'thumbs' directory as '<date>-nla.news-article<id>.jpg'. Articles that
    already have a thumbnail file are skipped, so an interrupted harvest can be
    restarted.

    Parameters:
        query: Trove search query.
        size: maximum width and height of each thumbnail.
        font_path: path to the TrueType font used to label thumbnails.
        font_size: font size of the labels.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    params = {
        "q": query,
        "category": "newspaper",
        "l-artType": "newspaper",
        "encoding": "json",
        "bulkHarvest": "true",
        "n": 100,
        "reclevel": "full",
    }
    start = "*"
    total = get_total_results(params)
    with tqdm(total=total) as pbar:
        while start:
            params["s"] = start
            response = s.get(
                "https://api.trove.nla.gov.au/v3/result",
                params=params,
                headers=headers,
                timeout=60,
            )
            response.raise_for_status()
            records = response.json()["category"][0]["records"]
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get("nextStart")
            # A page without any articles just has nothing to process
            for article in records.get("article", []):
                thumb_file = "thumbs/{}-nla.news-article{}.jpg".format(
                    article["date"], article["id"]
                )
                if not os.path.exists(thumb_file):
                    thumb = get_thumbnail(article, size, font_path, font_size)
                    if thumb is not None:
                        thumb.save(thumb_file)
                pbar.update(1)
def create_composite(cols, rows, size):
    """
    Paste the saved thumbnails into a grid and save it as one big image.

    Thumbnails are read from the 'thumbs' directory and placed left to right,
    top to bottom. The result is saved as 'composite-<cols>-<rows>.jpg'.

    Parameters:
        cols: number of thumbnails in each row.
        rows: number of rows of thumbnails.
        size: width and height of each grid cell.
    """
    im = Image.new("RGB", (cols * size, rows * size))
    thumbs = [t for t in os.listdir("thumbs") if t.endswith(".jpg")]
    # Uncomment the next line to arrange the thumbnails by date
    # thumbs = sorted(thumbs)
    capacity = cols * rows
    # Count only the thumbnails actually pasted, so that a skipped file
    # doesn't throw the row wrapping out of step with the paste position.
    placed = 0
    for thumb_file in tqdm(thumbs):
        if placed >= capacity:
            # The grid is full
            break
        x = (placed % cols) * size
        y = (placed // cols) * size
        with Image.open("thumbs/{}".format(thumb_file)) as thumb:
            try:
                im.paste(thumb, (x, y, x + size, y + size))
            except ValueError:
                # The thumbnail doesn't fit the size x size cell, so leave it out
                continue
        placed += 1
    im.save("composite-{}-{}.jpg".format(cols, rows), quality=90)
Create all the thumbnails¶
# Harvest a thumbnail for each article in the search results (saved in 'thumbs')
get_thumbnails(query, size, font_path, font_size)
Turn the thumbnails into one big image¶
# Paste the saved thumbnails into a cols x rows grid and save the composite image
create_composite(cols, rows, size)
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.