Get the page coordinates of a digitised newspaper article from Trove
In [2]:
from io import BytesIO
import requests
from bs4 import BeautifulSoup
from IPython.display import Image as DisplayImage
from IPython.display import display
from PIL import Image, ImageDraw
In [3]:
def get_box(zones):
    """
    Compute the bounding box that encloses every zone in ``zones``.

    Each zone is an element carrying ``data-x``, ``data-y``, ``data-w`` and
    ``data-h`` attributes (as strings) plus a ``data-page-id``. All zones are
    assumed to be on the same page; the page id is taken from the first zone.

    Returns a dict with keys ``page_id``, ``left``, ``top``, ``right`` and
    ``bottom`` describing the box around the article.

    Raises IndexError if ``zones`` is empty.
    """
    page_id = zones[0]["data-page-id"]
    # Use min/max over all zones rather than hard-coded sentinel values
    # (the old `left = 10000` initialisation broke on pages wider than
    # 10000 px, silently clamping the box).
    left = min(int(zone["data-x"]) for zone in zones)
    top = min(int(zone["data-y"]) for zone in zones)
    right = max(int(zone["data-x"]) + int(zone["data-w"]) for zone in zones)
    bottom = max(int(zone["data-y"]) + int(zone["data-h"]) for zone in zones)
    return {
        "page_id": page_id,
        "left": left,
        "top": top,
        "right": right,
        "bottom": bottom,
    }
def get_article_boxes(article_url):
    """
    Scrape the bounding boxes of an article from its HTML page.

    Trove attaches positional information (x, y, width, height) to each block
    of OCR output as data attributes on ``div.zone`` elements. This fetches
    the article's HTML and returns one bounding box per page the article
    appears on, as produced by ``get_box()``.
    """
    # Fetch and parse the article page.
    soup = BeautifulSoup(requests.get(article_url).text, "lxml")
    # Zones with the 'onPage' class belong to the article's first page.
    boxes = [get_box(soup.select("div.zone.onPage"))]
    # 'offPage' zones are continuations on subsequent pages; they arrive in
    # page order, so group consecutive runs sharing a page id into one box.
    remaining = soup.select("div.zone.offPage")
    if remaining:
        group = [remaining[0]]
        for zone in remaining[1:]:
            if zone["data-page-id"] == group[0]["data-page-id"]:
                group.append(zone)
            else:
                # Page changed — close out the current group and start anew.
                boxes.append(get_box(group))
                group = [zone]
        boxes.append(get_box(group))
    return boxes
def display_boxes(boxes):
    """
    Download the page image for each box and display it with the box drawn on.

    ``boxes`` is a list of dicts as returned by ``get_box()``. Each page image
    is fetched from the Trove image service, the article's bounding box is
    outlined in green, and the result is displayed inline in the notebook.
    """
    for box in boxes:
        # Build the image service url for this page (zoom level 7).
        page_url = (
            "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
                box["page_id"], 7
            )
        )
        # Fetch the page image and open it for editing.
        page_data = requests.get(page_url).content
        page_image = Image.open(BytesIO(page_data)).convert(mode="RGB")
        # Outline the article's bounding box in green.
        ImageDraw.Draw(page_image).rectangle(
            [(box["left"], box["top"]), (box["right"], box["bottom"])],
            outline=(0, 255, 0),
            width=20,
        )
        # Serialise to JPEG and render inline.
        jpeg = BytesIO()
        page_image.save(jpeg, format="JPEG")
        display(DisplayImage(data=jpeg.getvalue(), width=400))
In [4]:
# Scrape the zone coordinates for a sample article and show the boxes found.
boxes = get_article_boxes("https://trove.nla.gov.au/newspaper/article/20858554")
print(boxes)
[{'page_id': '2517226', 'left': 3472, 'top': 2770, 'right': 4394, 'bottom': 6544}, {'page_id': '2517227', 'left': 206, 'top': 320, 'right': 3054, 'bottom': 6723}]
In [5]:
# Download each page image and display the article's bounding box drawn on it.
display_boxes(boxes)
What can I do with this?ΒΆ
In the GLAM Workbench there's a notebook (and app) to save an article as an image using the code above. But what about building something like this into a pipeline to assemble a dataset of images? Perhaps illustrated advertisements by decade, by product type, or from the Australian Women's Weekly? A collection of weather maps?
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.