Convert a Trove list into a CollectionBuilder exhibition¶
This notebook converts Trove lists into a series of files that can be uploaded to a CollectionBuilder-GH repository to create an instant exhibition. See the CollectionBuilder site for more information on how CollectionBuilder works and what it can do.
Demo: this exhibition was generated from this Trove list.
1. What you need¶
- a Trove API key (copy & paste your key where indicated below)
- a GitHub account
- a Trove List containing items you want to include in your exhibition
2. Set up a GitHub repository for your exhibition¶
- Login to your GitHub account.
- Go to my customised CollectionBuilder-GH template repository.
- Click on the big green Use this template button.
- Give your repository a name by typing in the Repository name box – the name of the repository will form part of the url for your new exhibition, so you probably want to give it a name that relates to the exhibition.
- Click on the big green Create a repository from template button. You'll be automatically redirected to your new repository.
3. Enable GitHub Pages for your repository¶
GitHub builds your exhibition from the files in the repository using GitHub Pages. You need to enable this after you create your repository:
- Click on the Settings button in your new repository.
- Click on the Pages button in the side menubar.
- Under Branch select 'main' from the dropdown list and click on Save.
GitHub will now build your exhibition. Once it's ready you'll see a link on the 'Pages' page. The url will have the form https://[your GH user name].github.io/[your repository name]. At the moment the exhibition will contain dummy data – the next step is to generate your own exhibition data!
4. Generate your exhibition files from your Trove list¶
- Find your Trove list's numeric id. The list id is the number in the url of your Trove list. So the list with the url https://trove.nla.gov.au/list/83774 has an id of 83774.
- Copy and paste your list id and Trove API key where indicated below in this notebook.
- From the Jupyter Run menu select Run all cells.
- When everything has finished running, a link to a zip file will be displayed at the bottom of the notebook. Download it to your own computer and open the zip file. Done!
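If you'd rather not pick the id out of the url by eye, you can extract it programmatically. A minimal sketch using the demo list url from above:

```python
import re

url = "https://trove.nla.gov.au/list/83774"

# The list id is the run of digits following /list/ in the url
match = re.search(r"/list/(\d+)", url)
list_id = match.group(1) if match else None

print(list_id)  # → 83774
```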
5. Add more metadata (optional)¶
The metadata describing the items in your exhibition is contained in the _data/[list id]-items.csv file. If the items in your exhibition relate to specific places, you may want to add some extra metadata so that CollectionBuilder can display them on a map.
Information about places is contained in three columns: location, latitude, and longitude. In the location field you can include a list of place names, separated by semicolons, eg: 'Melbourne; Sydney; Hobart'. These place names will be used to build a word cloud when you click on the Location tab in your exhibition. To add an item to CollectionBuilder's map view, you need to supply values for latitude and longitude.
You might also want to edit the subject and description fields.
- Open your metadata file with either a text editor or a spreadsheet program (but beware that some programs, like Excel, might mangle your dates).
- Edit the desired values.
- Make sure the edited file is saved in CSV (plain text) format, replacing the original metadata file.
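If you're comfortable with pandas, you can make these edits programmatically instead of using a spreadsheet program. The sketch below uses an invented objectid and placeholder coordinates; in practice you'd read your own _data/[list id]-items.csv file with pd.read_csv and write it back with to_csv:

```python
import pandas as pd

# A tiny stand-in for the generated metadata file -- the column names match
# the location columns described above, but the rows are invented
df = pd.DataFrame(
    {
        "objectid": ["work-111111111", "work-222222222"],
        "location": ["", ""],
        "latitude": ["", ""],
        "longitude": ["", ""],
    }
)

# Add semicolon-separated place names and point coordinates for one item
df.loc[
    df["objectid"] == "work-111111111", ["location", "latitude", "longitude"]
] = ["Melbourne; Sydney", -37.8136, 144.9631]

# Write the edited file back as plain-text CSV, eg:
# df.to_csv("_data/[list id]-items.csv", index=False)
```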
Note that GitHub has its own built-in file editor. So if you don't have a way of editing the CSV file on your own computer, just skip down to the 'Upload your files...' section below and add the files to your GitHub repository. To edit the file, just view it in GitHub and click on the pencil icon. Once you've finished editing, make sure you click the Commit button to save your changes.
6. Replace tiny images (optional)¶
Trove work records often only include links to tiny thumbnailed versions of images. These don't look great in an exhibition, so you might want to replace them. Different collections use different image viewers, so there's no easy, automated way to do this. You'll have to manually download the full-size images and replace the thumbnailed versions.
- From the Trove work record, click on the View button and open the link to the original item.
- Use whatever download mechanism is provided to save a copy of the image on your computer.
- Rename the downloaded image to match the name of the tiny thumbnailed version in your exhibition's objects directory.
- Replace the thumbnail image in the objects directory with the new downloaded version.
7. Upload your files to the exhibition repository¶
You're now ready to add your exhibition files to the exhibition repository!
- Go to the GitHub repository you created above.
- Click on the Add file button and select Upload files.
- Select the _config.yml file in the exhibition files you downloaded from this notebook.
- Click on the green Commit changes button to save the file in your repository.
- Open the _data directory in your GitHub repository.
- Click on the Add file button and select Upload files.
- Select the _data/[list id]-items.csv file in your exhibition files.
- Click on the green Commit changes button to save the file in your repository.
- Open the objects directory in your GitHub repository.
- Click on the Add file button and select Upload files.
- Select all the files in the objects directory of your exhibition files.
- Click on the green Commit changes button to save the files in your repository.
Once you've uploaded the files, GitHub will rebuild the exhibition using your data. It might take a little while to generate, but once it's ready you'll see it at https://[your GH user name].github.io/[your repository name].
If you're not happy with the metadata and how it displays, you can either edit the exhibition files on your own computer and re-upload them to GitHub, or use GitHub's built-in file editor to make changes. To edit a file, just view it in GitHub and click on the pencil icon. Once you've finished editing, make sure you click the Commit button to save your changes.
Every time you make a change to your repository, GitHub will automatically rebuild your exhibition.
8. Further customisation¶
You can further customise the look and feel of your exhibition by editing the _data/theme.yml file. For example, you can:
- Set a different featured-image to display in the header of your exhibition.
- Change the latitude and longitude values to set the centre of the map view.
See the CollectionBuilder documentation for more options.
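For example, a theme.yml excerpt might look something like the sketch below. The field names come from the options mentioned above, but the filename and coordinates are placeholder values – check the CollectionBuilder documentation for the exact values your exhibition needs:

```yaml
# _data/theme.yml (excerpt) -- placeholder values only
featured-image: /objects/work-111111111-0.jpg
latitude: -27.4698
longitude: 153.0251
```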
Annotating Trove list items¶
You can add your own annotations to Trove list items and these will automatically be included in your exhibition. To add a descriptive note:
- Make sure you're logged in to your Trove account.
- Go to your list (you can find a list of your lists in your Trove user profile).
- Go to the item in your list you want to annotate and click on the Add list item note button.
- Add your note.
Your note will be added to the description field of the item when you generate your exhibition files. In addition, any tags added to items in your list will be added to the subject field.
Note that if you make changes to your list, you'll need to regenerate the exhibition files using this notebook and upload them to your GitHub repository before the changes are visible in your exhibition.
import json
import os
import re
import shutil
import tempfile
import time
from io import BytesIO
from operator import itemgetter
from pathlib import Path
import pandas as pd
import requests
import yaml
from dotenv import load_dotenv
from IPython.display import HTML
from lat_lon_parser import parse
from PIL import Image, UnidentifiedImageError
from PIL.ImageOps import contain
from pymarc import JSONReader
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from trove_newspaper_images.articles import download_images
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
load_dotenv()
Add your API key and list ID values¶
This is the only section that you'll need to edit. Paste your API key and list id in the cells below as indicated. Once you've finished, select Run all cells from the Run menu to generate your exhibition files.
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
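Since the setup cell above calls load_dotenv(), you can also keep your key out of the notebook entirely by saving it in a .env file in the same directory as the notebook (the value below is a placeholder):

```shell
# .env
TROVE_API_KEY=your-actual-api-key
```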
# Paste your list id between the quotes
list_id = "83777"
Define some functions¶
# Get Geo areas data
response = s.get(
"https://raw.githubusercontent.com/GLAM-Workbench/marc-geographicareas/main/marc_geographicareas.json"
)
GEO_AREAS = response.json()
def get_metadata(id):
    """
    Extract work data in a JSON string from the work's HTML page.
    """
    if not id.startswith("http"):
        id = "https://nla.gov.au/" + id
    # Use the retrying session defined above
    response = s.get(id)
    try:
        work_data = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        ).group(1)
    except AttributeError:
        work_data = "{}"
    return json.loads(work_data)
def parse_marc(metadata):
"""
Parse the bibliographic MARC data in the embedded metadata.
This produces a structure that can be loaded into PyMarc's JSON reader.
"""
# Some nla.obj items don't have MARC data
# For example some collections
try:
records = metadata["marcData"]["record"]
except KeyError:
return {}
# The metadata contains bibliographic and holdings MARC data
# here we'll select the bib record.
for record in records:
if record["leader"].get("type") == "Bibliographic":
break
fields = []
# Control fields only have content, no subfields
for cf in record.get("controlfield", []):
fields.append({str(cf["tag"]): str(cf["content"])})
# Loop through all the fields
for field in record["datafield"]:
subfields = []
# Get any subfields
sfs = field.get("subfield", [])
# The subfields value can be a list or dict
# Check if it's a list
if isinstance(sfs, list):
# Loop through the subfields adding the values
for sf in sfs:
subfields.append({sf["code"]: str(sf["content"])})
# If it's not a list just add the details from the dict
else:
subfields.append({sfs["code"]: str(sfs["content"])})
fields.append(
{
str(field["tag"]): {
"subfields": subfields,
"ind1": field["ind1"],
"ind2": field["ind2"],
}
}
)
return [{"leader": record["leader"]["content"], "fields": fields}]
def get_url(identifiers, linktype):
"""
Loop through the identifiers to find the requested type of url.
"""
for identifier in identifiers:
if identifier["linktype"] == linktype:
url = identifier["value"]
return url
def save_as_csv(list_dir, data, data_type):
    df = pd.DataFrame(data)
    # The pages column is only present if the list includes newspaper articles
    if "pages" in df.columns:
        df["pages"] = df["pages"].astype("Int64")
    df.to_csv(Path(list_dir, "_data", f"{list_id}-{data_type}.csv"), index=False)
def get_list(list_id):
"""
Get a List record from the API.
"""
list_url = f"https://api.trove.nla.gov.au/v3/list/{list_id}?encoding=json&reclevel=full&include=listItems"
response = s.get(list_url, headers={"X-API-KEY": API_KEY})
return response.json()
def get_article(id):
"""
Get a newspaper article record from the API
"""
article_api_url = f"https://api.trove.nla.gov.au/v3/newspaper/{id}/?encoding=json&reclevel=full&include=tags"
response = s.get(article_api_url, headers={"X-API-KEY": API_KEY})
return response.json()
def get_work(id):
"""
Get a work record from the API.
"""
article_api_url = f"https://api.trove.nla.gov.au/v3/work/{id}/?encoding=json&reclevel=full&include=workVersions,tags,links"
response = s.get(article_api_url, headers={"X-API-KEY": API_KEY})
return response.json()
def make_dirs(list_id):
"""
Create directories to store the outputs.
"""
list_dir = Path("cb-exhibitions", list_id)
list_dir.mkdir(parents=True, exist_ok=True)
Path(list_dir, "objects").mkdir(exist_ok=True)
# Path(list_dir, "temp").mkdir(exist_ok=True)
Path(list_dir, "_data").mkdir(exist_ok=True)
return list_dir
def get_all_tags(work):
    # Tags are dicts with a "value" key, so extract the value strings
    tags = [t["value"] for t in work.get("tag", [])]
    for version in work["version"]:
        for record in version["record"]:
            for tag in record.get("tag", []):
                tags.append(tag["value"])
    return tags
def get_subjects(work):
    # Copy the list so we don't modify the original record
    subjects = list(work.get("subject", []))
    subjects += get_all_tags(work)
    return subjects
def get_work_page_id(url):
nla_id = re.search(r"https?://nla.gov.au/(nla.obj-\d+)", url).group(1)
metadata = get_metadata(url)
if metadata["pid"] != nla_id:
for article in metadata.get("children", {}).get("article", []):
if article["pid"] == nla_id:
page_ids = [p["page"] for p in article.get("existson", [])]
return page_ids
return [nla_id]
def get_work_image_urls(record):
fulltext_url = get_url(record.get("identifier", ""), "fulltext")
if fulltext_url and "nla.obj" in fulltext_url:
page_ids = get_work_page_id(fulltext_url)
image_urls = [f"https://nla.gov.au/{p}/image" for p in page_ids]
elif image_url := get_url(record.get("identifier", ""), "viewcopy"):
image_urls = [image_url]
elif image_url := get_url(record.get("identifier", ""), "thumbnail"):
image_urls = [image_url]
else:
image_urls = []
return image_urls
def save_work_images(list_dir, record, max_size=1200):
filenames = []
image_urls = get_work_image_urls(record)
for i, image_url in enumerate(image_urls):
filename = Path(list_dir, "objects", f"work-{record.get('id', '')}-{i}.jpg")
filenames.append(filename)
if not filename.exists():
response = s.get(image_url)
if response.status_code == 200:
img = Image.open(BytesIO(response.content))
if max_size:
img = contain(
img, (max_size, max_size), method=Image.Resampling.LANCZOS
)
img.save(filename, "JPEG")
return filenames
def get_article_tags(record):
subjects = []
article = get_article(record["id"])
for tag in article.get("tag", []):
subjects.append(tag["value"])
return subjects
def get_value(record, field, keys=["value"]):
    """
    Get the values of a field.
    Some fields are lists of dicts, if so use `keys` to get the values.
    """
    value = record.get(field, [])
    if value and isinstance(value[0], dict):
        for key in keys:
            try:
                return [re.sub(r"\s+", " ", v[key]) for v in value]
            except KeyError:
                pass
        # None of the keys matched
        return []
    else:
        return value
def flatten_values(record, field, key="type"):
"""
If a field has a value and type, return the values as strings with this format: 'type: value'
"""
flattened = []
values = record.get(field, [])
for value in values:
if key in value:
flattened.append(f"{value[key]}: {value['value']}")
else:
flattened.append(value["value"])
return flattened
def get_locations(work):
"""
Get locations from the spatial field.
If they are LoC GeographicAreas then get the place labels from mappings.
"""
locations = []
for location in work.get("spatial", []):
if location.get("scheme") == "http://id.loc.gov/vocabulary/geographicAreas":
locations.append(GEO_AREAS[location["value"].strip("-")]["place"])
else:
locations.append(location["value"])
return locations
def has_type(work, format_type):
"""
Check the metadata for a specific format value.
"""
for ft in work.get("type", []):
if format_type in ft:
return True
return False
def check_coord(value, lat_lon):
    """
    Make sure that lat/longs are within the expected range.
    Raises a ValueError if a value is out of range.
    """
    if lat_lon == "lat" and abs(value) <= 90:
        return value
    elif lat_lon == "lon" and abs(value) <= 180:
        return value
    raise ValueError(f"{lat_lon} value out of range: {value}")
def get_center(parsed):
"""
Get the centre of a bounding box.
Returns point coords.
See: https://gis.stackexchange.com/a/394860
"""
e, w, n, s = itemgetter("east", "west", "north", "south")(parsed)
width = max(w, e) - min(w, e)
# get the box height
height = max(s, n) - min(s, n)
# compute the center
center = check_coord(round(min(s, n) + height / 2, 4), "lat"), check_coord(
round(min(w, e) + width / 2, 4), "lon"
)
return center
def parse_value(value):
"""
Parse latitude or longitude values.
"""
values = value.split("--")
# Sometimes single hyphens are used
if len(values) == 1:
values = value.split("-")
coords = [parse(v) for v in values]
return sorted(coords)
def parse_coords(coords):
"""
Parses a coordinate string, converting values to decimal.
For points -- returns latitude and longitude.
For boxes -- returns centre of box as latitude, longitude, and bounds as east, west, north, and south.
"""
parsed = {}
# Default values
for c in ["east", "west", "north", "south", "latitude", "longitude"]:
parsed[c] = None
try:
# Split string into lat and long using /
long, lat = coords.split("/")
if long.startswith("N"):
long, lat = lat, long
longs = parse_value(long)
lats = parse_value(lat)
except (ValueError, TypeError):
pass
else:
try:
# Bounding box
if len(longs) == 2 and len(lats) == 2:
parsed["east"] = check_coord(longs[-1], "lon")
parsed["west"] = check_coord(longs[0], "lon")
parsed["north"] = check_coord(lats[-1], "lat")
parsed["south"] = check_coord(lats[0], "lat")
# Get centre of bounding box
latitude, longitude = get_center(parsed)
parsed["latitude"] = latitude
parsed["longitude"] = longitude
# Point
elif len(longs) == 1 and len(lats) == 1:
parsed["latitude"] = check_coord(lats[0], "lat")
parsed["longitude"] = check_coord(longs[0], "lon")
except ValueError:
pass
return parsed
def get_coords(work):
fulltext_url = get_url(work.get("identifier", []), "fulltext")
if fulltext_url and "nla.obj" in fulltext_url and has_type(work, "Map"):
metadata = get_metadata(fulltext_url)
marc_json = parse_marc(metadata)
# PyMARC expects a JSON string so we dump it to a string first
reader = JSONReader(json.dumps(marc_json))
for record in reader:
if coord_string := record["255"]["c"]:
if coords := parse_coords(coord_string):
return coords
for location in work.get("spatial", []):
if location.get("scheme") == "http://id.loc.gov/vocabulary/geographicAreas":
place = GEO_AREAS[location["value"].strip("-")]
coords = place.get("coordinates", [])
for coord in coords:
lat, lon = coord.split(",")
return {"latitude": float(lat), "longitude": float(lon)}
return {}
def update_config(list_data, list_dir):
with Path("cb-config", "_config.yml").open("r") as config_in:
config = yaml.safe_load(config_in)
config["title"] = list_data["title"]
config["author"] = list_data["creator"].replace("public:", "")
config["metadata"] = f'{list_data["id"]}-items'
with Path(list_dir, "_config.yml").open("w") as config_out:
config_out.write(yaml.dump(config))
def harvest_list(list_id, max_size=1200):
list_dir = make_dirs(list_id)
data = get_list(list_id)
update_config(data, list_dir)
items = []
for item in tqdm(data["listItem"]):
for zone, record in item.items():
if zone == "work":
# Some fields aren't included in the list data, so get the full work record
work_data = get_work(record["id"])
coords = get_coords(work_data)
work = {
"objectid": f"work-{record.get('id', '')}",
"parentid": "",
"title": record.get("title", ""),
"type": ";".join(get_value(work_data, "type")),
"date": record.get("issued", ""),
"creator": "; ".join(get_value(work_data, "contributor")),
"is_part_of": "; ".join(flatten_values(work_data, "isPartOf")),
"publication_place": "; ".join(
get_value(work_data, "placeOfPublication")
),
"trove_url": record.get("troveUrl", ""),
"source_url": get_url(record.get("identifier", ""), "fulltext"),
"abstract": record.get("abstract", ""),
"description": item.get("note", ""),
"subject": "; ".join(get_subjects(work_data)),
"extent": work_data.get("extent", ""),
"format": "; ".join(get_value(work_data, "format")),
"language": "; ".join(get_value(work_data, "language")),
"rights": "; ".join(get_value(work_data, "rights")),
# coordinates for maps?
"location": "; ".join(get_locations(work_data)),
"latitude": coords.get("latitude", ""),
"longitude": coords.get("longitude", ""),
}
image_filenames = save_work_images(list_dir, work_data, max_size)
if len(image_filenames) > 1:
work["format"] = "compound_object"
items.append(work)
for i, image_file in enumerate(image_filenames):
child_work = work.copy()
child_work["parentid"] = f"work-{record.get('id', '')}"
child_work["objectid"] = f"work-{record.get('id', '')}-{i}"
child_work["filename"] = image_file.name
child_work["format"] = "image/jpeg"
items.append(child_work)
elif len(image_filenames) == 1:
work["filename"] = image_filenames[0].name
work["format"] = "image/jpeg"
items.append(work)
else:
work["format"] = "record"
items.append(work)
elif zone == "article":
newspaper_id = record.get("title", {}).get("id")
newspaper_title = record.get("title", {}).get("title")
newspaper_link = f'<a href="http://nla.gov.au/nla.news-title{newspaper_id}">{newspaper_title}</a>'
# citation =
article = {
"objectid": f"article-{record.get('id', '')}",
"parentid": "",
"title": record.get("heading", ""),
"date": record.get("date", ""),
"is_part_of": newspaper_link,
"pages": record.get("pageSequence", ""),
"trove_url": f'http://nla.gov.au/nla.news-article{record.get("id")}',
"type": "Newspaper article",
"description": item.get("note", ""),
"subject": "; ".join(get_article_tags(record)),
"location": "",
"latitude": "",
"longitude": "",
}
with tempfile.TemporaryDirectory() as dirpath:
images = []
tries = 0
# Trove has had some issues loading newspaper images lately
# This is an attempted workaround
while not images and tries < 2:
try:
images = download_images(record["id"], dirpath, masked=True)
except UnidentifiedImageError:
time.sleep(5)
tries += 1
# Use a page image if it can't get an article?
if len(images) > 1:
article["format"] = "compound_object"
items.append(article)
for i, image in enumerate(images):
if max_size:
img = Image.open(Path(dirpath, image))
img = contain(
img,
(max_size, max_size),
method=Image.Resampling.LANCZOS,
)
img.save(Path(list_dir, "objects", image), "JPEG")
else:
shutil.copy(
Path(dirpath, image),
Path(list_dir, "objects", image),
)
child_article = article.copy()
child_article["format"] = "image/jpeg"
child_article["parentid"] = (
f"article-{record.get('id', '')}"
)
child_article["objectid"] = (
f"article-{record.get('id', '')}-{i}"
)
child_article["filename"] = image
items.append(child_article)
elif len(images) == 1:
article["format"] = "image/jpeg"
article["filename"] = images[0]
items.append(article)
else:
article["format"] = "record"
items.append(article)
if items:
save_as_csv(list_dir, items, "items")
return items
Let's do it!¶
Run the cell below to start the exhibition building process.
items = harvest_list(list_id)
Download the results¶
Run the cell below to zip up all the harvested files and create a download link.
list_dir = Path("cb-exhibitions", list_id)
shutil.make_archive(list_dir, "zip", list_dir)
HTML(f'<a download="{list_id}.zip" href="{list_dir}.zip">Download your files</a>')
Created by Tim Sherratt for the GLAM Workbench.