Create large composite images from snipped words¶
This is a variation of the 'scissors & paste' notebook that extracts words from Trove newspaper images and compiles them into messages. In this notebook, you can harvest multiple versions of a list of words and compile them all into one big image.
# Import what we need
import os
import random
import shutil
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import FileLink, display
from PIL import Image, ImageOps
from rectpack import SORT_NONE, newPacker
load_dotenv()
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
HEADERS = {"X-API-KEY": API_KEY}
# List of words you want to harvest
WORD_LIST = [
"newspaper",
"book",
"magazine",
"journal",
"picture",
"data",
"music",
"map",
"people",
"discover",
"explore",
"web",
"research",
"create",
"article",
"history",
]
# Max number of images of each word you want to harvest (sometimes the words can't be found in the article, so the actual number will probably be a little less)
NUM_WORDS = 50
# Where to save the images
IMG_DIR = "words"
# Create the output directory
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)
def get_word_boxes(article_url):
"""
Get the boxes around highlighted search terms.
"""
boxes = []
# Get the article page
response = requests.get(article_url)
# Load in BS4
soup = BeautifulSoup(response.text, "lxml")
# Get the id of the newspaper page
page_id = soup.select("div.zone.onPage")[0]["data-page-id"]
# Find the highlighted terms
words = soup.select("span.highlightedTerm")
# Save the box coords
for word in words:
box = {
"page_id": page_id,
"left": int(word["data-x"]),
"top": int(word["data-y"]),
"width": int(word["data-w"]),
"height": int(word["data-h"]),
}
boxes.append(box)
return boxes
def crop_word(box, kw, article_id):
"""
Crop the box coordinates from the full page image.
"""
word_path = Path(f"{IMG_DIR}/{kw}-{article_id}.jpg")
if not word_path.exists():
# Construct the url we need to download the page image
page_url = (
"https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
box["page_id"], 7
)
)
# print(page_url)
# Download the page image
response = requests.get(page_url)
# Open download as an image for editing
img = Image.open(BytesIO(response.content))
word = img.crop(
(
box["left"] - 5,
box["top"] - 5,
box["left"] + box["width"] + 5,
box["top"] + box["height"] + 5,
)
)
img.close()
word.save(word_path)
def get_article_from_search(kw):
"""
Use the Trove API to find articles with the supplied keyword.
"""
params = {
"q": f'text:"{kw}"',
"category": "newspaper",
"l-artType": "newspaper",
"encoding": "json",
"n": NUM_WORDS,
}
response = requests.get(
"https://api.trove.nla.gov.au/v3/result", params=params, headers=HEADERS
)
data = response.json()
articles = data["category"][0]["records"]["article"]
for article in articles:
boxes = []
try:
boxes = get_word_boxes(article["troveUrl"])
except KeyError:
pass
if boxes:
crop_word(boxes[0], kw, article["id"])
time.sleep(1)
Get all the words¶
for word in WORD_LIST:
get_article_from_search(word)
Create the composite image¶
Here we use a packing algorithm to try and fit the little word images (which are a variety of shapes and sizes) into one big box with as few gaps as possible. Adjust the WIDTH
and HEIGHT
values below to change the size of the composite.
# Set width of composite image
WIDTH = 1000
# Set height of composite image
HEIGHT = 1000
# Set background colour of composite image
BG_COLOUR = (0, 0, 0)
def get_image_data():
images = []
for im in [i for i in Path(IMG_DIR).glob("*.jpg")]:
img = Image.open(im)
h, w = img.size
images.append((h + 4, w + 4, im.name))
return images
def pack_images():
images = get_image_data()
random.shuffle(images)
packer = newPacker(sort_algo=SORT_NONE, rotation=False)
for i in images:
packer.add_rect(*i)
packer.add_bin(WIDTH, HEIGHT)
packer.pack()
return len(images), packer.rect_list()
def create_composite(output_file=None):
num_images, rectangles = pack_images()
comp = Image.new("RGB", (WIDTH, HEIGHT), BG_COLOUR)
for rect in rectangles:
b, x, y, w, h, rid = rect
# print(x,y, w, h, rid)
word_path = Path(IMG_DIR, rid)
word = Image.open(word_path)
word = word.convert("RGB")
word_with_border = ImageOps.expand(word, border=2, fill=BG_COLOUR)
comp.paste(word_with_border, (x, y, x + w, y + h))
if not output_file:
output_file = (
f"trove-words-{int(datetime.now().timestamp())}-{WIDTH}-{HEIGHT}.jpg"
)
comp.save(output_file)
print(f"{len(rectangles)} of {num_images} images used")
display(FileLink(output_file))
Run the cell below to create a composite image of the words you've harvested. The function will tell you how many of the harvested words it was able to fit into the composite. You can adjust the width and height of the composite to fit in more, or fill up gaps.
create_composite()
# FOR TESTING ONLY -- IGNORE THIS CELL
if os.getenv("GW_STATUS") == "dev":
IMG_DIR = "test"
NUM_WORDS = 2
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)
WORD_LIST = ["annoyed", "angry"]
for word in WORD_LIST:
get_article_from_search(word)
create_composite()
shutil.rmtree(Path(IMG_DIR))
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.