Harvest illustrations from periodicals¶
This notebook shows you how to harvest illustrations from Trove's digitised periodicals. It makes use of layout information generated by Trove's OCR processing to find the coordinates of illustrations on a digitised page. Using these coordinates the illustrations can be cropped from the page image and saved.
You can save illustrations from:
- a single page
- a complete issue
- a single article
- a search for periodical articles in the Magazines & Newsletters category
If you save illustrations from an article or a search for articles you'll probably find you get extra images that weren't in the article(s) you specified. This is because periodicals aren't segmented in the way that newspaper articles are. When you request an article, Trove basically just redirects you to the first page on which that article appears. There's no way of knowing the boundaries of the article, so the code below simply gets every image on every page on which the article appears.
OCR data¶
You can access OCR data for a page using the page's `nla.obj` identifier. So if the page identifier is `nla.obj-5253237`, you can request the OCR data with the url:
https://nla.gov.au/nla.obj-5253237/ocr
The structure seems to vary a bit but, in general, illustrations turn up in objects under a `zs` key, while text is under `ps`. For illustrations, the `t` key gives the type of illustration, while `b` provides the coordinates (left, top, right, bottom) in a comma-separated string.
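To see what this looks like in practice, here's a minimal sketch (separate from the harvesting code below) that requests the OCR data for the sample page and prints the type and coordinates of each illustration block. It assumes the `zs`/`t`/`b` structure described above; the functions defined later handle the structural variations more defensively.

import requests

# Request the OCR data for the sample page mentioned above
data = requests.get("https://nla.gov.au/nla.obj-5253237/ocr").json()

# The top level of `data["print"]` can be an object or an array of objects
sections = data["print"] if isinstance(data["print"], list) else [data["print"]]
for section in sections:
    # Illustration blocks are under the `zs` key
    for block in section.get("zs", []):
        # `t` is the illustration type, `b` the "left,top,right,bottom" coordinates
        print(block.get("t"), block.get("b"))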
Illustration types¶
The illustration type names don't exactly match the illustration facets used in the web interface:
| Web interface | OCR data |
|---|---|
| Illustrated | Illustration |
| Photo | Content_Photo |
| Map | Content_Map |
| Graph | Content_Graph |
| Cartoon | Content_Cartoon |
| Music | Content_Music |
The OCR data also identifies advertisements. In the web interface, these are found by setting the `category` facet to `Advertising`. In the OCR data you need to look for the type `Advertisement`. Advertisements are, of course, a combination of text and images. Sometimes the OCR coordinates enclose both the text and the images; other times there are separate sets of coordinates for the text and images. If they're separated, you'll get multiple images for a single advertisement, and there's no easy way of knowing which illustration is associated with which block of text.
Page resolution¶
One of the tricky things I discovered is that the coordinates provided by the OCR data refer to a higher-resolution version of the page image than the one accessible through the web interface. To work around this you can find the dimensions of the high-res version in the metadata embedded in the digital viewer. You can then work out the ratio between the two sizes and apply it to the coordinates.
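As a sketch of the arithmetic (the dimensions and coordinates here are made up, not taken from a real page):

# The OCR coordinates refer to the high-res page, so scale them down
highres_width = 4000  # width recorded in the embedded viewer metadata
image_width = 2000    # width of the image actually downloaded

ratio = image_width / highres_width  # 0.5

# Scale a (left, top, right, bottom) box to match the downloaded image
coords = [800, 1200, 2400, 3000]
scaled = [c * ratio for c in coords]  # [400.0, 600.0, 1200.0, 1500.0]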
Captions¶
The captions of illustrations are not identified in the OCR data; they're generally just treated as text. If you want to include captions, you can add some extra space to the bottom of the illustration coordinates. To avoid inserting arbitrary dimensions, the code below looks at all the text on the page and finds the median line height. It then uses this value to add extra 'lines' at the bottom of the illustration. Of course, the text below an illustration might not relate to the illustration at all – there's really no way to be sure.
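For example (with made-up numbers): if the median line height on a page is 40 pixels and you ask for five caption lines, the bottom of the crop is extended by 200 pixels.

import statistics

# Made-up line heights harvested from a page's text blocks
line_heights = [38, 40, 41, 39, 42]
bottom = 3000      # bottom coordinate of the illustration
caption_lines = 5

# Extend the crop by the median line height times the number of caption 'lines'
new_bottom = bottom + statistics.median(line_heights) * caption_lines  # 3200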
What's missing?¶
I'm not sure how reliable the OCR data is when it comes to identifying illustrations. I've noticed some cases where the illustration details are missing. There's probably also some variation in the application of illustration types – some graphs might be labelled `Illustration`, or some maps labelled `Graph`. I think you should assume that you're not going to get every illustration. Nonetheless, it's a handy way of building a collection of illustrations without the overheads of machine learning.
File names and links¶
The code below saves illustrations using the `nla.obj` identifier of the page on which they're published and an index number starting at 0. So the first illustration on the page with identifier `nla.obj-6840078` will be saved as `nla.obj-6840078-0.jpg`.

To view an illustration in context, simply add `https://nla.gov.au/` to the page identifier, for example: https://nla.gov.au/nla.obj-6840078. This will display the page in Trove's digitised periodical viewer.
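In code terms, the file name and context link for a given illustration look like this (using the example identifier above):

page_id = "nla.obj-6840078"
index = 0  # position of the illustration on the page, starting at 0

file_name = f"{page_id}-{index}.jpg"           # nla.obj-6840078-0.jpg
context_url = f"https://nla.gov.au/{page_id}"  # view the page in Trove's viewer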
Import what we need¶
import itertools
import json
import os
import re
import statistics
import time
from datetime import timedelta
from io import BytesIO
from pathlib import Path
from statistics import StatisticsError
import requests
import requests_cache
from dotenv import load_dotenv
from IPython.display import Image as Display_Image
from IPython.display import display
from PIL import Image
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from tqdm.auto import tqdm
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
If you're going to be saving illustrations from a search, you'll need to insert your Trove API key below.
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
Define some functions¶
ILL_TYPES = {
    "Illustrated": ["Illustration"],
    "Photo": ["Content_Photo", "Photo"],
    "Map": ["Content_Map", "Map"],
    "Graph": ["Content_Graph", "Graph"],
    "Cartoon": ["Content_Cartoon", "Cartoon"],
    "Music": ["Content_Music", "Music"],
}
# Flatten the mapping into a single list of OCR type names
ILL_TYPE_DEFAULTS = list(itertools.chain(*ILL_TYPES.values()))
def get_metadata(id):
    """
    Extract work data in a JSON string from the work's HTML page.
    """
    if not id.startswith("http"):
        id = "https://nla.gov.au/" + id
    response = s.get(id)
    try:
        work_data = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        ).group(1)
    except AttributeError:
        work_data = "{}"
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)
def get_ill_coords(data, ill_types=ILL_TYPE_DEFAULTS):
    """
    Extract coordinates of all illustrations of the specified types from OCR data.
    """
    ill_coords = []
    # Structure of the OCR JSON varies, sometimes the top level is an object,
    # sometimes an array.
    # Try to get the `zs` block
    try:
        ill_blocks = data["print"].get("zs", [])
    # There's no `zs` key so this is probably an array
    except AttributeError:
        ill_blocks = []
        # Loop through all the blocks, keeping the `zs` labelled ones
        for section in data["print"]:
            for label, blocks in section.items():
                if label == "zs":
                    ill_blocks += blocks
    # Loop through all the saved blocks
    for block in ill_blocks:
        # Check if the illustration type is in the desired list
        if block.get("t") in ill_types:
            # Get the coordinates
            left, top, right, bottom = block["b"].split(",")
            # Add to the list of coords, converting strings to ints
            ill_coords.append([int(left), int(top), int(right), int(bottom)])
    return ill_coords
def get_page_size(page_id):
    """
    Get the dimensions of the current page image from embedded metadata.
    """
    metadata = get_metadata(page_id)
    for page in metadata["children"]["page"]:
        if page["pid"] == page_id:
            break
    return page["copies"][0]["technicalmetadata"]
def get_caption(data, coords, lines=5):
    """
    Adds extra space at the bottom of an image to try and include the caption.
    """
    left, top, right, bottom = coords
    heights = []
    # Get all the blocks containing text
    try:
        text_blocks = data["print"].get("ps", [])
    except AttributeError:
        text_blocks = []
        for section in data["print"]:
            for label, blocks in section.items():
                if label == "ps":
                    text_blocks += blocks
    # Loop through all text blocks and lines, saving the line heights
    for block in text_blocks:
        for line in block.get("ls", []):
            ll, lt, lr, lb = line["b"].split(",")
            heights.append(int(lb) - int(lt))
    try:
        # Return the bottom extended by the median line height x desired number of lines
        return bottom + (statistics.median(heights) * lines)
    except StatisticsError:
        return bottom
def save_illustrations_from_page(
    page_id,
    page_size=None,
    output_path=None,
    ill_types=ILL_TYPE_DEFAULTS,
    caption_lines=0,
):
    """
    Save all illustrations of the specified type from the supplied page.
    """
    saved_images = []
    # Prepare output path where the images will be saved
    if not output_path:
        output_path = Path("illustrations", "pages", page_id)
    output_path.mkdir(exist_ok=True, parents=True)
    # Request the OCR data for this page
    try:
        response = s.get(f"https://nla.gov.au/{page_id}/ocr")
        data = response.json()
    except requests.exceptions.ContentDecodingError:
        print(page_id, "no ocr")
    else:
        # Extract the coordinates of illustrations from the OCR data
        ill_coords = get_ill_coords(data, ill_types)
        if ill_coords:
            # Need the page size to calculate the ratio and update coords
            if not page_size:
                page_size = get_page_size(page_id)
            # Download the page image
            response = s.get(f"https://nla.gov.au/{page_id}/image")
            img = Image.open(BytesIO(response.content))
            # Get the image dimensions
            width, height = img.size
            # Calculate the ratio of image dimensions to page dimensions
            try:
                ratio = width / page_size["width"]
            # Some pages have 0 for dimensions
            except ZeroDivisionError:
                print(page_id, "no dimensions")
                # Is this a useful fallback?
                ratio = 1
            # Loop through all coords
            for i, coords in enumerate(ill_coords):
                # Add some extra space at the bottom to try and include the caption
                coords[3] = get_caption(data, coords, caption_lines)
                # Update the coords using the ratio
                coords = [c * ratio for c in coords]
                # Crop image using the coords
                ill_img = img.crop(tuple(coords))
                # Save the cropped image
                file_name = f"{page_id}-{i}.jpg"
                ill_img.save(Path(output_path, file_name))
                saved_images.append(file_name)
        if not response.from_cache:
            time.sleep(0.2)
    return saved_images
def save_illustrations_from_issue(
    issue_id, ill_types=ILL_TYPE_DEFAULTS, caption_lines=0
):
    """
    Save all illustrations of the specified type from the supplied issue.
    """
    output_path = Path("illustrations", "issues", issue_id)
    output_path.mkdir(exist_ok=True, parents=True)
    metadata = get_metadata(issue_id)
    for page in metadata["children"]["page"]:
        save_illustrations_from_page(
            page["pid"],
            page["copies"][0]["technicalmetadata"],
            output_path,
            ill_types,
            caption_lines,
        )
def save_illustrations_from_article(
    article_id,
    output_path=None,
    ill_types=ILL_TYPE_DEFAULTS,
    caption_lines=0,
):
    """
    Save all illustrations of the specified type from the supplied article.
    """
    if not output_path:
        output_path = Path("illustrations", "articles", article_id)
    output_path.mkdir(exist_ok=True, parents=True)
    # Get embedded metadata from viewer
    metadata = get_metadata(article_id)
    # Loop through all article records in metadata
    pages = []
    saved_images = []
    for article in metadata.get("children", {}).get("article", []):
        # When we find the article we want, save the list of pages it appears on
        if article["pid"] == article_id:
            pages = [p["page"] for p in article.get("existson", [])]
            break
    # Loop through pages
    for page_id in pages:
        page_size = None
        # Look for page info in metadata
        for page in metadata.get("children", {}).get("page", []):
            # When we find the page in metadata, save the page size details
            if page["pid"] == page_id:
                page_size = page["copies"][0]["technicalmetadata"]
                break
        if page_size:
            # Save illustrations from current page
            file_names = save_illustrations_from_page(
                page_id, page_size, output_path, ill_types, caption_lines
            )
            saved_images += file_names
def get_total_results(params, headers):
    """
    Get the total number of results for a search.
    """
    these_params = params.copy()
    these_params["n"] = 0
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result", params=these_params, headers=headers
    )
    data = response.json()
    return int(data["category"][0]["records"]["total"])
def get_fulltext_url(links):
    """
    Loop through the identifiers to find links to full text (digitised) versions.
    """
    urls = []
    for link in links:
        if (
            "linktype" in link
            and link["linktype"] == "fulltext"
            and "nla.obj" in link["value"]
        ):
            urls.append(link["value"])
    return urls
def convert_ill_types(ill_types):
    """
    Convert illustration types from web interface facet values to OCR data values.
    """
    converted_types = []
    for t in ill_types:
        converted_types += ILL_TYPES[t]
    return converted_types
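# For example, given the ILL_TYPES mapping above:
#   convert_ill_types(["Photo", "Map"])
#   returns ['Content_Photo', 'Photo', 'Content_Map', 'Map']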
def make_output_path(params, ill_types):
    """
    Create a directory for saved images by slugifying the query values and illustration types.
    """
    query_values = [
        v
        for k, v in params.items()
        if k == "q"
        or (k.startswith("l-") and k not in ["l-illustrationType", "l-category"])
    ]
    output_dir = slugify(" ".join(query_values + ill_types))
    output_path = Path("illustrations", output_dir)
    output_path.mkdir(exist_ok=True, parents=True)
    return output_path
def save_illustrations_from_search(query_params, ill_types=[], caption_lines=0):
    """
    Harvest illustrations from articles found using the supplied search parameters.
    """
    # Default params needed for API
    default_params = {
        "include": "links",
        "encoding": "json",
        "bulkHarvest": "true",
        "n": 100,
    }
    # Combine query and default params
    params = query_params | default_params
    # Add api key to request headers
    headers = {"X-API-KEY": API_KEY}
    # If illustration types are not specified look for them in facets and convert
    if not ill_types and "l-illustrationType" in params:
        ill_types = convert_ill_types([params["l-illustrationType"]])
    elif not ill_types and params.get("l-category") == "Advertising":
        ill_types = ["Advertisement"]
    # Otherwise use all illustration types
    elif not ill_types:
        ill_types = ILL_TYPE_DEFAULTS
    # Set path to save images
    output_path = make_output_path(params, ill_types)
    # Get the total number of results for progress bar
    total = get_total_results(params, headers)
    # Initial start value, will be replaced after each request
    start = "*"
    with tqdm(total=total) as pbar:
        # Continue until there's no start value
        while start:
            # Add start value to params
            params["s"] = start
            # Make search API request
            response = s.get(
                "https://api.trove.nla.gov.au/v3/result",
                params=params,
                headers=headers,
            )
            data = response.json()
            # Loop through the search results
            items = data["category"][0]["records"]["work"]
            for item in items:
                # Get all the article urls
                # If articles are grouped there can be multiple urls in a single work record
                urls = get_fulltext_url(item.get("identifier", []))
                # Loop through urls saving illustrations from each article
                for url in urls:
                    # Extract nla.obj id from url
                    article_id = url.strip("/").split("/")[-1]
                    save_illustrations_from_article(
                        article_id, output_path, ill_types, caption_lines
                    )
                pbar.update(1)
            # Try to get the next start value
            try:
                start = data["category"][0]["records"]["nextStart"]
            except KeyError:
                start = None
Examples¶
Save illustrations from a single periodical page¶
save_illustrations_from_page("nla.obj-375667954")
Save illustrations from a periodical issue¶
This example saves all the illustrations from the Comment and cartoons issue dated 7 December 1917.
save_illustrations_from_issue("nla.obj-69915406", caption_lines=5)
save_illustrations_from_issue("nla.obj-2623004085", caption_lines=0)
Save illustrations from a search for cat photos¶
This example searches for periodical articles with `cat` or `kitten` in their title that are illustrated with photographs and saves all the photos. Here's an example of the results.
query_params = {
"q": "title:cat OR title:kitten OR title:cats OR title:kittens",
"category": "magazine",
"l-illustrationType": "Photo",
}
save_illustrations_from_search(query_params)
# Rebuild the output path used by the harvest above (converting the facet value to OCR types)
img_path = make_output_path(query_params, convert_ill_types(["Photo"]))
img = next(Path(img_path).glob("*.jpg"))
display(Display_Image(img, width=300))
Save all the advertisements from a periodical¶
This is an example of harvesting illustrations from a search within a single periodical. In this case we're harvesting all the advertisements published in The Home : an Australian quarterly.
query_params = {
"q": "",
"category": "magazine",
"l-title": "The Home : an Australian quarterly",
"l-category": "Advertising",
}
save_illustrations_from_search(query_params)
# IGNORE TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
    query_params = {
        "q": "wragge",
        "category": "magazine",
        "l-title": "Walkabout",
        "l-illustrationType": "Photo",
    }
    save_illustrations_from_search(query_params)
Created by Tim Sherratt for the GLAM Workbench.