import re
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from PIL import Image

def get_box(zones):
    """
    Return a bounding box that encloses every zone in ``zones``.

    Each zone must support item access (e.g. a BeautifulSoup tag or a dict)
    for the keys ``data-x``, ``data-y``, ``data-w``, ``data-h`` and
    ``data-page-id``. The positional values are converted with ``int()``.

    Returns a dict with the keys ``page_id`` (taken from the first zone),
    ``left``, ``top``, ``right`` and ``bottom``.

    Raises ``IndexError`` if ``zones`` is empty.
    """
    # The first zone supplies the page id; this also rejects empty input.
    page_id = zones[0]["data-page-id"]
    # Convert every zone's attributes once, as (x, y, width, height).
    rects = [
        (
            int(zone["data-x"]),
            int(zone["data-y"]),
            int(zone["data-w"]),
            int(zone["data-h"]),
        )
        for zone in zones
    ]
    # min()/max() over the real values avoids sentinel starting limits,
    # which would silently clamp the box for coordinates beyond the sentinel.
    return {
        "page_id": page_id,
        "left": min(x for x, _, _, _ in rects),
        "top": min(y for _, y, _, _ in rects),
        "right": max(x + w for x, _, w, _ in rects),
        "bottom": max(y + h for _, y, _, h in rects),
    }


def get_article_boxes(article_url, timeout=30):
    """
    Scrape an article's HTML page and return one bounding box per page.

    The page is expected to contain ``div.zone`` elements carrying
    ``data-x``, ``data-y``, ``data-w``, ``data-h`` and ``data-page-id``
    attributes. Zones with the class ``onPage`` are boxed first; zones with
    the class ``offPage`` are then grouped into consecutive runs sharing the
    same ``data-page-id``, and each run gets its own box.

    ``timeout`` is the number of seconds passed to ``requests.get``.

    Returns a list of dicts as produced by ``get_box`` (empty if the page
    contains no zones).

    Raises ``requests.HTTPError`` if the server returns an error status.
    """
    response = requests.get(article_url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    boxes = []
    on_page_zones = soup.select("div.zone.onPage")
    # get_box() cannot handle an empty list, so only call it when zones exist.
    if on_page_zones:
        boxes.append(get_box(on_page_zones))

    # Collect consecutive off-page zones that share a page id, and emit a box
    # each time the page id changes.
    run = []
    current_page = None
    for zone in soup.select("div.zone.offPage"):
        page_id = zone["data-page-id"]
        if run and page_id != current_page:
            boxes.append(get_box(run))
            run = []
        current_page = page_id
        run.append(zone)
    if run:
        boxes.append(get_box(run))
    return boxes


def get_page_images(article_id, size):
    """
    Crop the article out of its page image(s), save the result(s) as JPEG
    files in the current working directory, and return the filename(s).

    ``article_id`` is the numeric article identifier used to build the
    article and output file names. ``size`` is the maximum width/height in
    pixels of the saved image; a falsy value (e.g. ``None``) keeps the crop
    at its original size.

    Returns a list of filenames, one per page the article appears on.

    Raises ``requests.HTTPError`` if a page image download returns an error
    status.
    """
    images = []
    # Get position of article on the page(s)
    boxes = get_article_boxes("http://nla.gov.au/nla.news-article{}".format(article_id))
    for box in boxes:
        # Construct the url we need to download the page image
        page_url = (
            "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
                box["page_id"], 7
            )
        )
        # Download the page image; stop on HTTP errors rather than handing an
        # error page to the image decoder.
        response = requests.get(page_url, timeout=60)
        response.raise_for_status()
        # Open download as an image for editing
        img = Image.open(BytesIO(response.content))
        # Crop the page image to the article's bounding box
        points = (box["left"], box["top"], box["right"], box["bottom"])
        cropped = img.crop(points)
        # Resize if necessary. Image.ANTIALIAS was removed in Pillow 10;
        # Image.LANCZOS is the same filter under its current name.
        if size:
            cropped.thumbnail((size, size), Image.LANCZOS)
        # Save the cropped image and remember its filename
        cropped_file = "nla.news-article{}-{}.jpg".format(article_id, box["page_id"])
        cropped.save(cropped_file)
        images.append(cropped_file)
    return images


def get_article(article_url, size):
    """
    Save the article at ``article_url`` as image(s) and display them.

    The numeric article id is extracted from the url (the digits following
    ``article`` or ``article/``). The images are then created by
    ``get_page_images`` and, for each one, a download link and the image
    itself are displayed in the notebook.

    ``size`` is passed through to ``get_page_images`` to limit the image
    dimensions; use ``None`` for full size.

    Raises ``ValueError`` if no article id can be found in ``article_url``.
    """
    match = re.search(r"article\/{0,1}(\d+)", article_url)
    # Report an unusable url explicitly instead of failing on ``None.group``.
    if match is None:
        raise ValueError(
            f"Could not find an article id in the url: {article_url!r}"
        )
    article_id = match.group(1)
    images = get_page_images(article_id, size)
    for image in images:
        display(HTML(f'<a href="{image}" download>Download {image}</a>'))
        display(HTML(f'<img src="{image}">'))

# Copy the url of the article you want to save and paste it between the quotes.
article_url = "https://trove.nla.gov.au/newspaper/article/107024751?searchTerm=wragge"

# Maximum width/height (in pixels) of the saved image(s).
# Leave as None if you want them at full size.
max_size = None

# Download, crop, save and display the article image(s) using the settings above.
get_article(article_url, max_size)

# Save a Trove newspaper article as an image

# How to use this notebook

# Running live on Binder

# Setting your options

# Get your images!

# Load all the things we need

# Set your options

# Get the images!