Download a collection of digitised images¶
Digitised photographs and other images are often organised into collections. While the Trove web interface does include a download option for collections, it has a number of limitations:
- the images are all combined into a single zip file
- you can generally download a maximum of 20 images at a time
- the resolution of the downloaded images is often quite low
This notebook provides an alternative method that downloads all of the available images in a collection (and any sub-collections) at the highest available resolution. The method is as follows:
- the nla.obj identifiers for all the items in the collection are harvested from the browse interface
- a url to download a high-resolution version of the image is constructed using each nla.obj id
- each image is downloaded and saved
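For example, the download url for a single image is built by adding /image to the item's nla.obj identifier. This is the same pattern used by the download_image() function below; the identifier here is the example poster mentioned later in this notebook.
item_id = "nla.obj-147116797"
# High-resolution image download url for this item
image_url = f"https://nla.gov.au/{item_id}/image"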
The downloaded images will be saved in the images/[COLLECTION ID] folder. Once the harvest is complete, the dataset will be zipped up with an RO-Crate metadata file and a link displayed for easy download. The RO-Crate metadata file captures the context and results of the harvest.
The image file names use the nla.obj identifiers. For example, the image of nla.obj-147116797 is saved as nla.obj-147116797.jpg. The identifiers also link the image back to the website: nla.obj-147116797.jpg comes from https://nla.gov.au/nla.obj-147116797.
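As a rough sketch of the resulting layout (mirroring the path handling in the downloading functions below), the save path for the example image above is built like this:
from pathlib import Path

# '[COLLECTION ID]' is a placeholder for the collection's nla.obj identifier
collection_id = "[COLLECTION ID]"
item_id = "nla.obj-147116797"
# Images end up in an 'images' subfolder of images/[COLLECTION ID],
# named using the item's nla.obj identifier
file_path = Path("images", collection_id, "images", f"{item_id}.jpg")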
Finding collections of images¶
There's no direct way of searching for collections; they tend to be mixed up in search results with individual images. Not all digitised images are in collections, but if they are you can use the breadcrumbs navigation to move up the hierarchy. Each level in the collection hierarchy will have its own nla.obj identifier that you can use to download images from that level and below.
For example, this excellent poster is part of a very large collection of digitised posters and sits at the bottom of the breadcrumb hierarchy: Home > Guide to Pre-1950 Advertising Posters in the National Library of Australia digitised by the 2019 Tax Time Appeal > Poster drawers > Posters. Clicking on 'Guide to Pre-1950 Advertising Posters', 'Poster drawers', or 'Posters' will take you to different levels in the collection hierarchy. You can then just copy the nla.obj identifier from the url and paste it below to download all child images.
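If you've copied a full url, one simple way to pull out the nla.obj identifier is a regular expression (a minimal sketch; the identifier format matches the examples above):
import re

url = "https://nla.gov.au/nla.obj-147116797"
# nla.obj identifiers are 'nla.obj-' followed by a string of digits
match = re.search(r"nla\.obj-\d+", url)
collection_id = match.group(0) if match else None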
import json
import mimetypes
import os
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
import ipynbname
import nbformat
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
from requests.packages.urllib3.util.retry import Retry
from rocrate.rocrate import ContextEntity, ROCrate
from tqdm.auto import tqdm
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
def prepare_url(url):
"""
Make sure nla.obj identifiers are properly formatted urls.
"""
url = re.sub(r"https?://nla/", "https://nla.gov.au/", url)
url = url.replace("\\\\", "//")
if not url.startswith("http"):
# print(url)
url = f"https://nla.gov.au/{url.strip('/')}"
return url
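# A couple of quick checks of how prepare_url() normalises identifiers --
# a bare identifier gains the https://nla.gov.au/ prefix, and a malformed
# 'http://nla/' url is repaired.
print(prepare_url("nla.obj-147116797"))  # https://nla.gov.au/nla.obj-147116797
print(prepare_url("http://nla/nla.obj-147116797"))  # https://nla.gov.au/nla.obj-147116797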
def get_work_data(url):
"""
Extract work data in a JSON string from the work's HTML page.
"""
url = prepare_url(url)
try:
response = s.get(url)
    except ConnectionError:
        # If the request fails completely, report the url and return an empty result
        print(url)
        return {}
if response.ok:
try:
work_data = re.search(
r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
).group(1)
except AttributeError:
work_data = "{}"
else:
print(url)
work_data = "{}"
if not response.from_cache:
time.sleep(0.2)
return json.loads(work_data)
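# Usage sketch: fetch the embedded metadata for an item and check whether it
# lists downloadable copies -- this is how download_images() below decides if a
# sub-collection also has an image of its own.
metadata = get_work_data("nla.obj-147116797")
print("copies" in metadata)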
def harvest_collection_items(collection_id, include_subcollections=False):
"""
Harvest all the items in a Trove collection (including any sub-collections)
by scraping the item identifiers from the 'Browse collection' pop-up.
    See the Trove Data Guide for more details.
"""
# The initial startIdx value
start = 0
# Number of results per page, used to increment the startIdx value
n = 20
items = []
# If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
while n == 20:
url = f"https://nla.gov.au/{collection_id}/browse?startIdx={start}&rows=20&op=c"
# Get the browse page
response = s.get(url)
# Beautifulsoup turns the HTML into an easily navigable structure
soup = BeautifulSoup(response.text, "html.parser")
# Find all the divs containing issue details and loop through them
details = soup.find_all(class_="l-item-info")
for detail in details:
# Set a default type
item_type = "item"
# Look for the a tag with class "obj-reference content"
item_id = detail.find(
lambda tag: tag.name == "a"
and tag.get("class") == ["obj-reference", "content"]
)["href"].strip("/")
# Look for a link to 'children', indicating it's a subcollection (or a book or issue with pages)
has_children = detail.find(
lambda tag: tag.name == "a" and tag.get("class") == ["obj-reference"]
)
# If it has children, harvest items from the subcollection
if has_children and include_subcollections is True:
item_type = "collection"
items += harvest_collection_items(item_id, include_subcollections=True)
# Save the item
# The parent_id will enable us to identify items that are in subcollections
items.append(
{"item_id": item_id, "item_type": item_type, "parent_id": collection_id}
)
time.sleep(0.2)
# Increment the startIdx
start += n
# Set n to the number of results on the current page
n = len(details)
return items
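# Usage sketch: harvest all the item identifiers from the example collection used
# at the end of this notebook. Each result is a dict like
# {"item_id": "nla.obj-...", "item_type": "item", "parent_id": "nla.obj-2590820305"}.
items = harvest_collection_items("nla.obj-2590820305", include_subcollections=True)
print(len(items))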
def create_rocrate(collection_id, dir_path, start_date, end_date):
"""
Create an RO-Crate metadata file describing the downloaded dataset.
"""
crate = ROCrate()
    # Add the directory of downloaded images to the crate
    crate.add_tree(dir_path)
    # Describe the current notebook using the rocrate section of its metadata
    nb_path = ipynbname.path()
nb = nbformat.read(nb_path, nbformat.NO_CONVERT)
metadata = nb.metadata.rocrate
nb_url = metadata.get("url", "")
nb_properties = {
"@type": ["File", "SoftwareSourceCode"],
"name": metadata.get("name", ""),
"description": metadata.get("description", ""),
"encodingFormat": "application/x-ipynb+json",
"codeRepository": metadata.get("codeRepository", ""),
"url": nb_url,
}
crate.add(ContextEntity(crate, nb_url, properties=nb_properties))
    # Describe this run of the notebook as a CreateAction that produced the dataset
    action_id = f"{nb_path.stem}_run"
action_properties = {
"@type": "CreateAction",
"instrument": {"@id": nb_url},
"actionStatus": {"@id": "http://schema.org/CompletedActionStatus"},
"name": f"Run of notebook: {nb_path.name}",
"result": {"@id": f"{dir_path.name}/"},
"query": collection_id,
"startDate": start_date,
"endDate": end_date,
}
crate.add(ContextEntity(crate, action_id, properties=action_properties))
    # Add file-level details (size, date modified, format) for each downloaded image
    for img in dir_path.glob("*.jpg"):
encoding = mimetypes.guess_type(img)[0]
stats = img.stat()
size = stats.st_size
date = datetime.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d")
crate.update_jsonld(
{
"@id": f"images/{img.name}",
"dateModified": date,
"contentSize": size,
"encodingFormat": encoding,
}
)
    # Save the RO-Crate metadata file and zip up the complete dataset
    crate.write(dir_path.parent)
    crate.write_zip(dir_path.parent)
def download_image(item_id, dir_path, not_available):
    """
    Download the image for an item at the highest available resolution,
    saving it with a file name based on its nla.obj identifier.
    """
file_path = Path(dir_path, f"{item_id}.jpg")
if not file_path.exists():
url = f"https://nla.gov.au/{item_id}/image"
response = s.get(url, stream=True)
# Exclude 404 responses and 'not available' images
if response.ok and response.content != not_available:
file_path.write_bytes(response.content)
time.sleep(1)
def download_images(collection_id, create_crate=True):
    """
    Download all the available images in a collection (and any sub-collections),
    optionally packaging the results with an RO-Crate metadata file.
    """
start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Set up a directory to save the images to
dir_path = Path("images", collection_id, "images")
dir_path.mkdir(exist_ok=True, parents=True)
# Load a 'not available' image to compare with what we download
# If the bytes match then we won't save it
not_available = Path("not_available.jpg").read_bytes()
# Get the image identifiers
items = harvest_collection_items(collection_id, include_subcollections=True)
for item in tqdm(items):
item_id = item["item_id"]
if item["item_type"] == "item":
download_image(item_id, dir_path, not_available)
if item["item_type"] == "collection":
# Sometimes items with children also have images that aren't included amongst the children!!
# We need to look at the embedded metadata to check for copies
metadata = get_work_data(item_id)
if "copies" in metadata:
download_image(item_id, dir_path, not_available)
end_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if create_crate is True:
create_rocrate(collection_id, dir_path, start_date, end_date)
display(
HTML(
f"Download dataset: <a href='images/{collection_id}.zip', download>images/{collection_id}.zip</a>"
)
)
download_images("nla.obj-2590820305")
# IGNORE THIS CELL -- TESTING ONLY
if os.getenv("GW_STATUS") == "dev":
# ipynbname won't work in testing env, so don't create the crate
download_images("nla.obj-2590820305", create_crate=False)
Created by Tim Sherratt for the GLAM Workbench.