# Import what we need
import os
import random
import shutil
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import FileLink, display
from PIL import Image, ImageOps
from rectpack import SORT_NONE, newPacker

load_dotenv()

# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")

HEADERS = {"X-API-KEY": API_KEY}

# List of words you want to harvest
WORD_LIST = [
    "newspaper",
    "book",
    "magazine",
    "journal",
    "picture",
    "data",
    "music",
    "map",
    "people",
    "discover",
    "explore",
    "web",
    "research",
    "create",
    "article",
    "history",
]

# Max number of images of each word you want to harvest (sometimes the words can't be found in the article, so the actual number will probably be a little less)
NUM_WORDS = 50

# Where to save the images
IMG_DIR = "words"

# Create the output directory
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)

def get_word_boxes(article_url):
    """
    Get the boxes around highlighted search terms.
    """
    boxes = []
    # Get the article page
    response = requests.get(article_url)
    # Load in BS4
    soup = BeautifulSoup(response.text, "lxml")
    # Get the id of the newspaper page
    page_id = soup.select("div.zone.onPage")[0]["data-page-id"]
    # Find the highlighted terms
    words = soup.select("span.highlightedTerm")
    # Save the box coords
    for word in words:
        box = {
            "page_id": page_id,
            "left": int(word["data-x"]),
            "top": int(word["data-y"]),
            "width": int(word["data-w"]),
            "height": int(word["data-h"]),
        }
        boxes.append(box)
    return boxes


def crop_word(box, kw, article_id):
    """
    Crop the box coordinates from the full page image.
    """
    word_path = Path(f"{IMG_DIR}/{kw}-{article_id}.jpg")
    if not word_path.exists():
        # Construct the url we need to download the page image
        page_url = (
            "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
                box["page_id"], 7
            )
        )
        # print(page_url)
        # Download the page image
        response = requests.get(page_url)
        # Open download as an image for editing
        img = Image.open(BytesIO(response.content))
        word = img.crop(
            (
                box["left"] - 5,
                box["top"] - 5,
                box["left"] + box["width"] + 5,
                box["top"] + box["height"] + 5,
            )
        )
        img.close()
        word.save(word_path)


def get_article_from_search(kw):
    """
    Use the Trove API to find articles with the supplied keyword.
    """
    params = {
        "q": f'text:"{kw}"',
        "category": "newspaper",
        "l-artType": "newspaper",
        "encoding": "json",
        "n": NUM_WORDS,
    }
    response = requests.get(
        "https://api.trove.nla.gov.au/v3/result", params=params, headers=HEADERS
    )
    data = response.json()
    articles = data["category"][0]["records"]["article"]
    for article in articles:
        boxes = []
        try:
            boxes = get_word_boxes(article["troveUrl"])
        except KeyError:
            pass
        if boxes:
            crop_word(boxes[0], kw, article["id"])
        time.sleep(1)

for word in WORD_LIST:
    get_article_from_search(word)

# Set width of composite image
WIDTH = 1000

# Set height of composite image
HEIGHT = 1000

# Set background colour of composite image
BG_COLOUR = (0, 0, 0)

def get_image_data():
    images = []
    for im in [i for i in Path(IMG_DIR).glob("*.jpg")]:
        img = Image.open(im)
        h, w = img.size
        images.append((h + 4, w + 4, im.name))
    return images


def pack_images():
    images = get_image_data()
    random.shuffle(images)
    packer = newPacker(sort_algo=SORT_NONE, rotation=False)
    for i in images:
        packer.add_rect(*i)
    packer.add_bin(WIDTH, HEIGHT)
    packer.pack()
    return len(images), packer.rect_list()


def create_composite(output_file=None):
    num_images, rectangles = pack_images()
    comp = Image.new("RGB", (WIDTH, HEIGHT), BG_COLOUR)
    for rect in rectangles:
        b, x, y, w, h, rid = rect
        # print(x,y, w, h, rid)
        word_path = Path(IMG_DIR, rid)
        word = Image.open(word_path)
        word = word.convert("RGB")
        word_with_border = ImageOps.expand(word, border=2, fill=BG_COLOUR)
        comp.paste(word_with_border, (x, y, x + w, y + h))
    if not output_file:
        output_file = (
            f"trove-words-{int(datetime.now().timestamp())}-{WIDTH}-{HEIGHT}.jpg"
        )
    comp.save(output_file)
    print(f"{len(rectangles)} of {num_images} images used")
    display(FileLink(output_file))

create_composite()

# FOR TESTING ONLY -- IGNORE THIS CELL
if os.getenv("GW_STATUS") == "dev":
    IMG_DIR = "test"
    NUM_WORDS = 2
    Path(IMG_DIR).mkdir(parents=True, exist_ok=True)
    WORD_LIST = ["annoyed", "angry"]
    for word in WORD_LIST:
        get_article_from_search(word)
    create_composite()
    shutil.rmtree(Path(IMG_DIR))

Create large composite images from snipped words¶

Get all the words¶

Create the composite image¶