import json
import os
import re
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from iiif_prezi3 import Manifest, Range, config
from iiif_prezi.factory import ManifestFactory
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared HTTP session: responses are cached and expire after 30 days.
s = requests_cache.CachedSession(expire_after=timedelta(days=30))

# Retry policy applied to both URL schemes: up to 5 attempts with backoff
# whenever the server answers 502, 503 or 504.
retries = Retry(
    status_forcelist=[502, 503, 504],
    backoff_factor=1,
    total=5,
)
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Load settings from a .env file (GW_STATUS is read from the environment below).
load_dotenv()

# GETTING DATA FROM TROVE


def harvest_collection_items(collection_id, include_subcollections=False):
    """
    Harvest the items in a Trove collection by scraping the item identifiers
    from the 'Browse collection' pop-up, 20 results at a time.

    Parameters:
        collection_id: the nla.obj identifier of the collection to browse.
        include_subcollections: when True, entries that link to children are
            marked as collections and their children are harvested recursively.

    Returns:
        A list of dicts, one per browse entry, with the keys "item_id",
        "item_type" ("item" or "collection"), "parent_id" (the collection
        being browsed) and "children" (a list of dicts of the same shape,
        empty unless children were harvested).
    """
    # Number of results requested per page, also used to advance startIdx
    page_size = 20
    # The initial startIdx value
    start = 0
    items = []
    while True:
        url = f"https://nla.gov.au/{collection_id}/browse?startIdx={start}&rows={page_size}&op=c"
        # Get the browse page
        response = s.get(url)

        # Beautifulsoup turns the HTML into an easily navigable structure
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all the divs containing item details and loop through them
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            # Look for the a tag with class "obj-reference content"
            link = detail.find(
                lambda tag: tag.name == "a"
                and tag.get("class") == ["obj-reference", "content"]
            )
            # An entry without an identifier link can't be harvested; skip it
            # rather than failing the whole harvest by subscripting None.
            if link is None or not link.get("href"):
                continue
            item_id = link["href"].strip("/")

            # Look for a link to 'children', indicating it's a subcollection
            # (or a book or issue with pages)
            has_children = detail.find(
                lambda tag: tag.name == "a" and tag.get("class") == ["obj-reference"]
            )

            # If it has children, harvest items from the subcollection
            if has_children and include_subcollections is True:
                item_type = "collection"
                children = harvest_collection_items(
                    item_id, include_subcollections=True
                )
            else:
                item_type = "item"
                children = []

            # Save the item
            # The parent_id will enable us to identify items that are in subcollections
            items.append(
                {
                    "item_id": item_id,
                    "item_type": item_type,
                    "parent_id": collection_id,
                    "children": children,
                }
            )
        # Only pause when the page actually came from the server, not the cache
        if not response.from_cache:
            time.sleep(0.2)
        # Anything other than a full page means we've reached the end
        if len(details) != page_size:
            break
        # Move on to the next page of results
        start += page_size
    return items


def prepare_url(url):
    """
    Normalise an nla.obj identifier or partial link into a full url.
    Repairs a truncated 'nla' host, turns doubled backslashes into slashes,
    and prefixes bare identifiers with the nla.gov.au address.
    """
    fixed = re.sub(r"https?://nla/", "https://nla.gov.au/", url)
    fixed = fixed.replace("\\\\", "//")
    # Already a url, nothing more to do
    if fixed.startswith("http"):
        return fixed
    # Otherwise treat it as a bare identifier
    return f"https://nla.gov.au/{fixed.strip('/')}"


def get_work_data(url):
    """
    Extract work data in a JSON string from the work's HTML page.

    Parameters:
        url: an nla.obj identifier or url; it is normalised with prepare_url.

    Returns:
        The parsed work data as a dict, or an empty dict when the request
        fails, the response is not ok, or the page has no embedded work data.
        The url is printed when the request fails or the response is not ok.

    Raises:
        json.JSONDecodeError: if the embedded work data is not valid JSON.
    """
    # Requests' exceptions do not derive from the builtin ConnectionError,
    # so they have to be caught by their own base class.
    from requests.exceptions import RequestException

    url = prepare_url(url)
    try:
        response = s.get(url)
    except (RequestException, ConnectionError):
        # No response to inspect, so report the url and give up on this work
        print(url)
        return {}
    # Only pause when the page actually came from the server, not the cache
    if not response.from_cache:
        time.sleep(0.2)
    if not response.ok:
        print(url)
        return {}
    # The work metadata is embedded in the page as a JavaScript object literal
    match = re.search(
        r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
    )
    if match is None:
        return {}
    return json.loads(match.group(1))


def get_page_title(url):
    """
    Extract the value of the title tag from a HTML page.

    Parameters:
        url: an nla.obj identifier or url; it is normalised with prepare_url.

    Returns:
        The stripped text of the page's title tag, or "No title" when the
        request fails, the response is not ok, or the page has no usable title.
        The url is printed when the request fails.
    """
    # Requests' exceptions do not derive from the builtin ConnectionError,
    # so they have to be caught by their own base class.
    from requests.exceptions import RequestException

    url = prepare_url(url)
    try:
        response = s.get(url)
    except (RequestException, ConnectionError):
        print(url)
        return "No title"
    if not response.ok:
        return "No title"
    # Name the parser explicitly, as the browse-page scraper does, so results
    # don't depend on which parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.title
    # A page may have no title tag, or one without a single text child
    if title_tag is None or title_tag.string is None:
        return "No title"
    return str(title_tag.string).strip()


# METADATA PROCESSING FOR BOTH V2 and V3


def clean_date(date):
    """
    Extract the year from a date string.

    Parameters:
        date: a date string whose last four characters are expected to be
            the year. Non-string values (such as None) are treated as having
            no year.

    Returns:
        The four-digit year as a string, or an empty string if the value
        does not end with four digits.
    """
    # Metadata fields can be present but null, so guard before searching
    if not isinstance(date, str):
        return ""
    match = re.search(r"\d{4}$", date)
    return match.group(0) if match else ""


def get_date_range(metadata):
    """
    Collect the years found in the startDate and endDate fields.
    Returns a list holding whichever years could be extracted, start first.
    """
    years = [
        clean_date(metadata.get(field, "")) for field in ("startDate", "endDate")
    ]
    # Drop fields that were missing or had no recognisable year
    return list(filter(None, years))


def round_dimensions(dimensions, max=5000):
    """
    Scale dimensions down so the longest side is no more than a given maximum.
    Images in Trove normally have a limit of 5000 pixels on the longest dimension,
    but the dimensions in the metadata are often larger. This shrinks the values
    from the metadata to reflect the likely size of the downloadable image,
    keeping the aspect ratio.
    Dimensions already within the limit are returned unchanged.
    """
    height, width = dimensions["height"], dimensions["width"]
    # Already within the limit, hand back the original dict
    if height <= max and width <= max:
        return dimensions
    # Scale against whichever side is longer (width wins a tie)
    if height > width:
        scale = max / height
        return {"height": max, "width": round(width * scale)}
    scale = max / width
    return {"height": round(height * scale), "width": max}


def get_dimensions(item_metadata):
    """
    Get the dimensions of the image access copy from item metadata.

    Parameters:
        item_metadata: a dict of work data; the dimensions are read from the
            "technicalmetadata" of the first entry in its "copies" list.

    Returns:
        A dict with "height" and "width", limited by round_dimensions.
        Falls back to 5000 x 5000 when the dimensions are missing or unusable.
    """
    try:
        dimensions = item_metadata["copies"][0]["technicalmetadata"]
        dimensions = round_dimensions(dimensions)
    except (KeyError, IndexError, TypeError):
        # KeyError: a field is absent; IndexError: the copies list is empty;
        # TypeError: a field is null or holds a non-numeric value.
        dimensions = {"height": 5000, "width": 5000}
    return dimensions


def prepare_item_title(item_metadata, index, using_parent=False):
    """
    Build a canvas title from the item title and other metadata.
    Depending on what is available, the title is followed in brackets by:
    - subUnit info (eg Item 1001)
    - page number (when the parent's metadata is being used, or for Books)
    """
    base = item_metadata.get("title", "No title")
    unit_parts = (item_metadata.get("subUnitType"), item_metadata.get("subUnitNo"))
    qualifier = " ".join(part for part in unit_parts if part)
    if using_parent:
        # Children described by their parent are told apart by page number
        qualifier = f"{qualifier}, page {index}" if qualifier else f"page {index}"
    elif item_metadata.get("form") == "Book":
        # For books the page number takes the place of any subUnit info
        qualifier = f"page {index}"
    if not qualifier:
        return base
    return f"{base} ({qualifier})"


def prepare_collection_title(item_metadata, item):
    """
    Build a range title from the item title and other metadata.
    Depending on what is available, the title is followed in brackets by:
    - subUnit info (eg Series 1)
    - number of child ranges
    - number of child canvases
    """
    label = item_metadata.get("title", "No title")
    unit_parts = (item_metadata.get("subUnitType"), item_metadata.get("subUnitNo"))
    unit = " ".join(part for part in unit_parts if part)

    # Tally the children by type
    kinds = [child["item_type"] for child in item["children"]]
    subcollections = kinds.count("collection")
    pages = kinds.count("item")

    details = [unit] if unit else []
    if subcollections:
        details.append(f"{subcollections} items")
    if pages:
        details.append(f"{pages} pages")
    if not details:
        return label
    return f"{label} ({', '.join(details)})"


def prepare_metadata(item_metadata):
    """
    Pick out the useful fields from the item metadata and return them
    in a dict ready to be added to a manifest.
    Fields that are missing or empty are left out.
    """
    # (label used in the manifest, key in the item metadata)
    field_map = (
        ("type", "form"),
        ("creator", "creator"),
        ("publisher", "publisherName"),
        ("extent", "extent"),
        ("rights", "rights"),
        ("call number", "holdingNumber"),
    )
    metadata = {}
    for label, source_key in field_map:
        value = item_metadata.get(source_key)
        if value:
            metadata[label] = value
    # Express the start and end years as a single range, eg 1901-1905
    years = get_date_range(item_metadata)
    if years:
        metadata["date"] = "-".join(years)
    return metadata


# CODE FOR IIIF PRESENTATION API V3


def add_metadata_v3(iiif_obj, item_metadata):
    """
    Copy the prepared item metadata onto the IIIF object,
    one label/value pair at a time.
    """
    for label, value in prepare_metadata(item_metadata).items():
        iiif_obj.add_metadata(label, value)


def add_item_to_manifest(manifest, item, index):
    """
    Add an item to the manifest as a canvas.
    Items without an attached image are skipped.
    """
    item_id = item["item_id"]
    item_url = f"https://nla.gov.au/{item_id}"
    item_metadata = get_work_data(item_id)

    # Presence of 'copies' indicates there's an image attached.
    # This covers both items and 'collections' that have images
    # attached that are not included amongst their children.
    if "copies" not in item_metadata:
        return

    dimensions = get_dimensions(item_metadata)
    height, width = dimensions["height"], dimensions["width"]

    # If there's no title in the item metadata, it's probably the child
    # of a subcollection, so describe it using the parent's metadata.
    using_parent = not item_metadata.get("title")
    if using_parent:
        if "parent" in item:
            item_metadata = item["parent"]
        else:
            item_metadata = get_work_data(item["parent_id"])

    # Create the canvas and add metadata
    canvas = manifest.make_canvas(
        id=item_url,
        label=prepare_item_title(item_metadata, index, using_parent),
    )
    canvas.set_hwd(height=height, width=width)
    canvas.add_thumbnail(f"{item_url}-t", format="image/jpeg")
    add_homepage(canvas, item_id)
    add_metadata_v3(canvas, item_metadata)

    # Add the image to the canvas
    canvas.add_image(
        image_url=f"{item_url}/image",
        anno_page_id=f"{item_url}/page/",
        anno_id=f"{item_url}/annotation/",
        format="image/jpeg",
        height=height,
        width=width,
    )


def add_items(manifest, items):
    """
    Add every collection item to the manifest as a canvas, numbering
    the items at each level from 1.
    """
    for position, entry in enumerate(items, start=1):
        add_item_to_manifest(manifest, entry, position)
        # Items can be nested in subcollections, so descend into the children.
        add_items(manifest, entry["children"])


def add_range(item):
    """
    Build a range from a subcollection, nesting a range for each
    child subcollection and a reference for each child item.
    """
    item_id = item["item_id"]
    item_metadata = get_work_data(item_id)
    # Some 'collections' have images attached that aren't amongst their children.
    # If there's a 'copies' attribute in the metadata, the item is added
    # to the front of its own list of children.
    if "copies" in item_metadata:
        item["children"].insert(0, {"item_id": item_id, "item_type": "item"})
    rng = Range(
        id=f"https://nla.gov.au/{item_id}/range",
        label=prepare_collection_title(item_metadata, item),
    )
    for child in item["children"]:
        kind = child["item_type"]
        if kind == "item":
            rng.add_item({"id": f"https://nla.gov.au/{child['item_id']}"})
        elif kind == "collection":
            rng.add_item(add_range(child))
    return rng


def add_ranges(mf, items):
    """
    Add a range to the manifest for each subcollection
    found in a list of collection items.
    """
    subcollections = (entry for entry in items if entry["item_type"] == "collection")
    for subcollection in subcollections:
        mf.add_range(add_range(subcollection))


def add_homepage(iiif_obj, trove_id):
    """
    Set the homepage of the supplied IIIF object to a record
    pointing at the item's Trove url.
    """
    iiif_obj.homepage = dict(
        id=f"https://nla.gov.au/{trove_id}",
        type="Text",
        label={"en": ["View in Trove"]},
        format="text/html",
    )


def create_manifest_v3(coll_id, repo=None):
    """
    Build a manifest conforming to v3 of the IIIF Presentation API.
    Harvests metadata from all the items within the specified collection,
    and assembles them as a manifest, with each digitised image included as a canvas.
    The manifest is saved as a JSON file in the 'manifests' directory.
    """
    # If there's no metadata in the page (such as with a finding aid)
    # fall back to the page title.
    coll_metadata = get_work_data(coll_id) or {"title": get_page_title(coll_id)}
    config.configs["helpers.auto_fields.AutoLang"].auto_lang = "en"

    # The manifest id is the url where the saved file will be published
    filename = f"{coll_id}-v3-manifest.json"
    base_url = repo.strip("/") if repo else "https://glam-workbench.net/trove-images"

    # Construct the manifest and add metadata
    manifest = Manifest(
        id=f"{base_url}/{filename}",
        label=coll_metadata.get("title", "No title"),
    )
    generated = datetime.now().strftime("%d %b %Y")
    manifest.summary = f"This manifest was generated on {generated} by harvesting collection metadata from the Trove website."
    add_homepage(manifest, coll_id)
    add_metadata_v3(manifest, coll_metadata)

    # Get items in this collection and construct item list and range structures.
    items = harvest_collection_items(coll_id, include_subcollections=True)
    add_items(manifest, items)
    add_ranges(manifest, items)

    # Save the manifest
    output_dir = Path("manifests")
    output_dir.mkdir(exist_ok=True)
    (output_dir / filename).write_text(manifest.json())


# CODE FOR IIIF V2 PRESENTATION API


def add_metadata_v2(iiif_obj, item_metadata):
    """
    Set the metadata of a v2 IIIF object (manifest or canvas)
    from the prepared item metadata, if there is any.
    """
    metadata = prepare_metadata(item_metadata)
    if not metadata:
        return
    iiif_obj.set_metadata(metadata)


def add_item_to_seq(seq, item, index):
    """
    Add a canvas for the given item to the sequence of canvases.
    The item's metadata is fetched and used to build the canvas;
    items without an attached image are skipped.
    Returns the sequence.
    """
    item_id = item["item_id"]
    item_url = f"https://nla.gov.au/{item_id}"
    item_metadata = get_work_data(item_id)

    # Presence of 'copies' indicates there's an image attached.
    # This covers both items and 'collections' that have images
    # attached that are not included amongst their children.
    if "copies" not in item_metadata:
        return seq

    dimensions = get_dimensions(item_metadata)
    height, width = dimensions["height"], dimensions["width"]

    # If there's no title in the item metadata, it's probably the child
    # of a subcollection, so describe it using the parent's metadata.
    using_parent = not item_metadata.get("title")
    if using_parent:
        if "parent" in item:
            item_metadata = item["parent"]
        else:
            item_metadata = get_work_data(item["parent_id"])

    # Build the Canvas and add metadata
    canvas = seq.canvas(
        ident=item_url,
        label=prepare_item_title(item_metadata, index, using_parent),
    )
    canvas.set_hw(height, width)
    add_metadata_v2(canvas, item_metadata)
    canvas.thumbnail = {"@id": f"{item_url}-t"}

    # Add image info to the canvas
    anno = canvas.annotation(ident=f"{item_url}/view")
    img = anno.image(ident=f"{item_url}/image", iiif=False)
    img.format = "image/jpeg"
    img.set_hw(height, width)
    return seq


def build_sequence(seq, items):
    """
    Fill the sequence with canvases, one for each item in the collection,
    numbering the items at each level from 1.
    """
    for position, entry in enumerate(items, start=1):
        seq = add_item_to_seq(seq, entry, position)
        # Descend into the children to add nested canvases
        build_sequence(seq, entry["children"])


def build_structure(mf, items, toc=None):
    """
    Build a Table of Contents structure listing subcollections (if any).
    This consists of a series of 'ranges'. Each range can contain more ranges, as
    well as canvases. There's not a lot of documentation about this, so there might be easier
    approaches, but it seems to work ok.

    Parameters:
        mf: the v2 manifest in which the ranges are created.
        items: a list of harvested item dicts; only those with an item_type
            of "collection" produce a range.
        toc: an optional top-level range. When supplied, each range created
            at this level is added to it. The recursive call does not pass
            it on, so ranges for nested subcollections are not added to it.

    Note that the "children" list of a collection is modified in place when
    the collection has an image of its own.
    """
    for item in items:
        item_id = item["item_id"]
        if item["item_type"] == "collection":
            # Get metadata from the sub-collection web page
            item_metadata = get_work_data(item_id)
            # Some 'collections' have images attached that aren't amongst their children
            # If there's a 'copies' attribute in the metadata, we'll add the item as one of its own children
            if "copies" in item_metadata:
                item["children"].insert(0, {"item_id": item_id, "item_type": "item"})
            # Prepare a title for the range
            title = prepare_collection_title(item_metadata, item)
            # Create the range within the manifest
            rng = mf.range(ident=f"https://nla.gov.au/{item_id}/range", label=title)
            # The toc is a top-level range which will be displayed in navigation
            if toc:
                toc.add_range(rng)
            # Loop through the children of this subcollection
            for child in item["children"]:
                child_id = f"https://nla.gov.au/{child['item_id']}"
                # Add child ranges to this range
                # NOTE(review): ranges are created above with an ident ending in
                # "/range", but child_id has no such suffix -- confirm that this
                # reference resolves to the child's range.
                if child["item_type"] == "collection":
                    rng.add_range(child_id)
                # Add canvases to this range
                elif child["item_type"] == "item":
                    rng.add_canvas(child_id)
            # Recurse back through child subcollections adding them all to the manifest
            build_structure(mf, item["children"])


def create_manifest_v2(coll_id, repo=None):
    """
    Build a manifest conforming to v2 of the IIIF Presentation API.
    Harvests metadata from all the items within the specified collection,
    and assembles them as a manifest, with each digitised image included as a canvas.

    Parameters:
        coll_id: the nla.obj identifier of the collection.
        repo: optional base url where the manifest will be published; used to
            build the manifest's id. Defaults to the GLAM Workbench address.

    The manifest is saved as '{coll_id}-v2-manifest.json' in the 'manifests'
    directory, which is created if necessary.
    """
    coll_metadata = get_work_data(coll_id)
    items = harvest_collection_items(coll_id, include_subcollections=True)
    coll_url = f"https://nla.gov.au/{coll_id}"

    fac = ManifestFactory()

    # If there's no metadata in the page (such as with a finding aid)
    # get the page title.
    if not coll_metadata:
        coll_metadata = {"title": get_page_title(coll_id)}

    if repo:
        manifest_id = f"{repo.strip('/')}/{coll_id}-v2-manifest.json"
    else:
        manifest_id = (
            f"https://glam-workbench.net/trove-images/{coll_id}-v2-manifest.json"
        )

    # Build the Manifest
    # Fall back to "No title" (as the v3 builder does) so that metadata
    # without a usable title never produces a manifest with a None label.
    label = coll_metadata.get("title") or "No title"
    mf = fac.manifest(ident=manifest_id, label=label)
    add_metadata_v2(mf, coll_metadata)
    mf.attribution = (
        f"National Library of Australia (via Trove). See: https://nla.gov.au/{coll_id}"
    )
    mf.description = f"This manifest was generated on {datetime.now().strftime('%d %b %Y')} by harvesting collection metadata from the Trove website."
    mf.related = {"@id": coll_url, "label": "View in Trove"}

    # And walk through the pages
    build_sequence(mf.sequence(label="Normal Order"), items)

    # Create a top-level range to display the contents
    toc = mf.range(ident=f"{coll_url}/toc", label="TOC")
    toc.viewingHint = "top"
    # Create ranges for sub-collections
    build_structure(mf, items, toc)

    js = mf.toString(compact=True)

    # Save the manifest
    Path("manifests").mkdir(exist_ok=True)
    Path("manifests", f"{coll_id}-v2-manifest.json").write_text(js)

# Build and save the v3 manifest for an example collection.
# This runs unconditionally whenever the code is executed.
create_manifest_v3("nla.obj-140670968")

# IGNORE THIS CELL -- TESTING ONLY

# Rebuild the same manifest, but only when the GW_STATUS
# environment variable is set to "dev".
if os.getenv("GW_STATUS") == "dev":
    create_manifest_v3("nla.obj-140670968")

collection	manifest	UV3	Mirador
B.A.N.Z. Antarctic Research Expedition photographs	nla.obj-141170265-v3-manifest.json	view in UV3	view in Mirador
The Papers of Sir Edmund Barton	nla.obj-224441684-v3-manifest.json	view in UV3	view in Mirador
Papers relating to the Federation Campaign (a single series from the Barton papers)	nla.obj-224441858-v3-manifest.json	view in UV3	view in Mirador
Postcard portraits of actresses, and of Australian towns, 1900s	nla.obj-224441858-v3-manifest.json	view in UV3	view in Mirador

Save a collection of digitised images as an IIIF manifest

Examples