Save a collection of digitised images as an IIIF manifest
This notebook harvests metadata describing the contents of a digitised collection in Trove and saves it as an IIIF manifest. This makes it possible to work with data from Trove in a variety of IIIF-compliant tools.
The code below harvests data from Trove's digitised collection viewer and assembles it into an IIIF manifest. There are separate functions to generate manifests that comply with either version 2 or version 3 of the IIIF Presentation API. It works with any of Trove's digitised collections, including photographs, artworks, and manuscript finding aids.
All you need is the `nla.obj` identifier of the collection you want to harvest. For example, to create a v3-compliant IIIF manifest from the finding aid of The Papers of Sir Edmund Barton (which has the identifier `nla.obj-224441684`), you'd run:
create_manifest_v3("nla.obj-224441684")
Manifests are saved in the `manifests` directory, using the `nla.obj` identifier in the filename. So, the Barton papers manifest would be named `nla.obj-224441684-v3-manifest.json`.
Examples
import json
import os
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from iiif_prezi3 import Manifest, Range, config
from iiif_prezi.factory import ManifestFactory
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Shared HTTP session: every response is cached for 30 days so that repeated
# harvests don't request the same pages again.
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
# Retry up to five times, with backoff, on 502/503/504 responses.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ("https://", "http://"):
    # Each scheme gets its own adapter using the same retry policy.
    s.mount(scheme, HTTPAdapter(max_retries=retries))
# Load environment variables (such as GW_STATUS) from a .env file.
load_dotenv()
# GETTING DATA FROM TROVE
def harvest_collection_items(collection_id, include_subcollections=False):
    """
    Scrape the identifiers of all the items in a Trove collection.

    Pages through the collection's 'Browse collection' listing, 20 results at
    a time, until a page comes back with fewer than 20 results.

    Returns a list of dicts, one per listed item, with the keys:
    - item_id: the identifier taken from the item's link
    - item_type: "collection" if the item has children and
      include_subcollections is True, otherwise "item"
    - parent_id: the collection_id the item was listed under
    - children: the harvested contents of a subcollection (otherwise empty)
    """
    page_size = 20

    def is_item_link(tag):
        # The link to the item itself has exactly these two classes.
        return tag.name == "a" and tag.get("class") == ["obj-reference", "content"]

    def is_children_link(tag):
        # A link with only this class points to the item's children, meaning
        # it's a subcollection (or a book or issue with pages).
        return tag.name == "a" and tag.get("class") == ["obj-reference"]

    harvested = []
    offset = 0
    while True:
        url = f"https://nla.gov.au/{collection_id}/browse?startIdx={offset}&rows=20&op=c"
        response = s.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Each result on the browse page sits in an element with this class.
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            item_id = detail.find(is_item_link)["href"].strip("/")
            has_children = detail.find(is_children_link)
            if has_children and include_subcollections is True:
                item_type = "collection"
                children = harvest_collection_items(
                    item_id, include_subcollections=True
                )
            else:
                item_type = "item"
                children = []
            # The parent_id makes it possible to identify items that are
            # in subcollections.
            harvested.append(
                {
                    "item_id": item_id,
                    "item_type": item_type,
                    "parent_id": collection_id,
                    "children": children,
                }
            )
        # Only pause when the page actually came from the web server.
        if not response.from_cache:
            time.sleep(0.2)
        # A short page means we've reached the end of the listing.
        if len(details) != page_size:
            break
        offset += page_size
    return harvested
def prepare_url(url):
    """
    Turn an nla.obj identifier or malformed link into a well-formed url.

    Repairs urls that use the bare host 'nla' or doubled backslashes, and
    prefixes anything that doesn't start with 'http' with the nla.gov.au host.
    """
    fixed = re.sub(r"https?://nla/", "https://nla.gov.au/", url).replace(
        "\\\\", "//"
    )
    if fixed.startswith("http"):
        return fixed
    # Bare identifiers (possibly wrapped in slashes) become full urls.
    return f"https://nla.gov.au/{fixed.strip('/')}"
def get_work_data(url):
    """
    Extract work data in a JSON string from the work's HTML page.

    Parameters:
        url: an nla.obj identifier or url (normalised with prepare_url).

    Returns:
        The embedded work metadata as a dict. An empty dict is returned if
        the connection fails, the response is not OK, or the page has no
        embedded work data. The url is printed when the request fails.
    """
    # requests' ConnectionError is not a subclass of the builtin of the same
    # name, so both have to be listed for the handler to catch network failures.
    from requests.exceptions import ConnectionError as RequestsConnectionError

    url = prepare_url(url)
    try:
        response = s.get(url)
    except (ConnectionError, RequestsConnectionError):
        # There's no response to inspect, so report the url and give up on
        # this page rather than failing on an unbound variable below.
        print(url)
        return {}
    if response.ok:
        match = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        )
        # Pages such as finding aids have no embedded work data.
        work_data = match.group(1) if match else "{}"
    else:
        print(url)
        work_data = "{}"
    # Only pause when the page actually came from the web server.
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)
def get_page_title(url):
    """
    Extract the value of the title tag from a HTML page.

    Parameters:
        url: an nla.obj identifier or url (normalised with prepare_url).

    Returns:
        The stripped text of the page's title tag, or "No title" if the
        request fails, the response is not OK, or the page has no usable
        title. The url is printed when the connection fails.
    """
    # requests' ConnectionError is not a subclass of the builtin of the same
    # name, so both have to be listed for the handler to catch network failures.
    from requests.exceptions import ConnectionError as RequestsConnectionError

    url = prepare_url(url)
    try:
        response = s.get(url)
    except (ConnectionError, RequestsConnectionError):
        # Without a response there's nothing to parse.
        print(url)
        return "No title"
    if response.ok:
        # Name the parser explicitly, as the browse-page scraper does, so the
        # result doesn't depend on which parsers happen to be installed.
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.title
        # The tag can be missing, or have no single string inside it.
        if title_tag is not None and title_tag.string is not None:
            title = str(title_tag.string).strip()
            if title:
                return title
    return "No title"
# METADATA PROCESSING FOR BOTH V2 and V3
def clean_date(date):
    """
    Extract the year from a date string.

    The year is taken to be the four digits at the very end of the value.

    Parameters:
        date: the date value from the item metadata. None and non-string
            values (such as a bare integer year) are accepted.

    Returns:
        The four-digit year as a string, or an empty string if the value
        doesn't end with one.
    """
    # A field that is present but null would otherwise make re.search raise
    # a TypeError.
    if date is None:
        return ""
    match = re.search(r"\d{4}$", str(date))
    return match.group(0) if match else ""
def get_date_range(metadata):
    """
    Collect the years from the startDate and endDate fields.

    Returns a list holding the start year and then the end year, leaving
    out any that are missing or contain no year.
    """
    years = (
        clean_date(metadata.get(field, "")) for field in ("startDate", "endDate")
    )
    return [year for year in years if year]
def round_dimensions(dimensions, max=5000):
    """
    Scale dimensions down so that the longest side equals a given maximum.

    Images in Trove normally have a limit of 5000 pixels on the longest
    dimension, but the dimensions in the metadata are often larger. This
    reduces the dimensions in the metadata to reflect the likely size of the
    downloadable image.

    If both sides are already within the maximum, the supplied dict is
    returned as is. Otherwise a new dict with just 'height' and 'width' is
    returned.
    """
    height, width = dimensions["height"], dimensions["width"]
    if height <= max and width <= max:
        return dimensions
    # Scale by whichever side is longest (width wins a tie).
    portrait = height > width
    ratio = max / (height if portrait else width)
    if portrait:
        return {"height": max, "width": round(width * ratio)}
    return {"height": round(height * ratio), "width": max}
def get_dimensions(item_metadata):
    """
    Get the dimensions of the image access copy from item metadata.

    The dimensions are read from the technical metadata of the first copy
    and scaled down with round_dimensions.

    Returns:
        A dict with 'height' and 'width'. Falls back to 5000 x 5000 when the
        dimensions can't be read from the metadata.
    """
    try:
        dimensions = item_metadata["copies"][0]["technicalmetadata"]
        dimensions = round_dimensions(dimensions)
    except (KeyError, IndexError, TypeError):
        # KeyError: a field is missing; IndexError: 'copies' is an empty
        # list; TypeError: a value is null rather than a dict or number.
        dimensions = {"height": 5000, "width": 5000}
    return dimensions
def prepare_item_title(item_metadata, index, using_parent=False):
    """
    Build a canvas title from the item's title and other metadata.

    Where available, a bracketed suffix is added containing:
    - subUnit info (eg Item 1001)
    - a page number, when the title comes from the parent's metadata
      or the item's form is 'Book'
    """
    title = item_metadata.get("title", "No title")
    unit_parts = (item_metadata.get("subUnitType"), item_metadata.get("subUnitNo"))
    suffix = " ".join(filter(None, unit_parts))
    if using_parent and suffix:
        suffix = f"{suffix}, page {index}"
    elif using_parent or item_metadata.get("form") == "Book":
        # Note that for books the page number replaces any subUnit info.
        suffix = f"page {index}"
    return f"{title} ({suffix})" if suffix else title
def prepare_collection_title(item_metadata, item):
    """
    Build a range title from a subcollection's title and other metadata.

    Where available, a bracketed, comma-separated suffix is added containing:
    - subUnit info (eg Series 1)
    - the number of children that are themselves collections
    - the number of children that are items
    """
    title = item_metadata.get("title", "No title")
    unit_parts = (item_metadata.get("subUnitType"), item_metadata.get("subUnitNo"))
    unit = " ".join(filter(None, unit_parts))
    child_types = [child["item_type"] for child in item["children"]]
    subcollection_count = child_types.count("collection")
    page_count = child_types.count("item")
    contents = [unit] if unit else []
    if subcollection_count:
        contents.append(f"{subcollection_count} items")
    if page_count:
        contents.append(f"{page_count} pages")
    if not contents:
        return title
    return f"{title} ({', '.join(contents)})"
def prepare_metadata(item_metadata):
    """
    Select the fields from the item metadata that will go in the manifest.

    Returns a dict keyed by display label. Only fields with a value are
    included, followed by a 'date' entry (start and end years joined by a
    hyphen) when any years are available.
    """
    # (label used in the manifest, field name in the Trove metadata)
    field_map = (
        ("type", "form"),
        ("creator", "creator"),
        ("publisher", "publisherName"),
        ("extent", "extent"),
        ("rights", "rights"),
        ("call number", "holdingNumber"),
    )
    metadata = {
        label: value
        for label, field in field_map
        if (value := item_metadata.get(field))
    }
    if dates := get_date_range(item_metadata):
        metadata["date"] = "-".join(dates)
    return metadata
# CODE FOR IIIF PRESENTATION API V3
def add_metadata_v3(iiif_obj, item_metadata):
    """
    Attach each prepared metadata field to the IIIF object as a
    label/value pair.
    """
    for label, value in prepare_metadata(item_metadata).items():
        iiif_obj.add_metadata(label, value)
def add_item_to_manifest(manifest, item, index):
    """
    Add an item to the manifest as a canvas, if it has an image attached.

    The index is the item's position amongst its siblings and is used for
    page numbers in the canvas title.
    """
    item_id = item["item_id"]
    item_url = f"https://nla.gov.au/{item_id}"
    item_metadata = get_work_data(item_id)
    # Presence of 'copies' indicates there's an image attached. This covers
    # both items and 'collections' that have images attached that are not
    # included amongst their children. Anything else gets no canvas.
    if "copies" not in item_metadata:
        return
    dimensions = get_dimensions(item_metadata)
    height, width = dimensions["height"], dimensions["width"]
    # An item without a title is probably the child of a subcollection,
    # so describe it using the parent's metadata instead.
    using_parent = not item_metadata.get("title")
    if using_parent:
        try:
            item_metadata = item["parent"]
        except KeyError:
            item_metadata = get_work_data(item["parent_id"])
    label = prepare_item_title(item_metadata, index, using_parent)
    # Create the canvas and describe it
    canvas = manifest.make_canvas(
        id=item_url,
        label=label,
    )
    canvas.set_hwd(height=height, width=width)
    canvas.add_thumbnail(f"{item_url}-t", format="image/jpeg")
    add_homepage(canvas, item_id)
    add_metadata_v3(canvas, item_metadata)
    # Attach the image itself to the canvas
    canvas.add_image(
        image_url=f"{item_url}/image",
        anno_page_id=f"{item_url}/page/",
        anno_id=f"{item_url}/annotation/",
        format="image/jpeg",
        height=height,
        width=width,
    )
def add_items(manifest, items):
    """
    Walk the harvested items, adding each one to the manifest as a canvas.

    Each item is followed directly by its own children, so nested
    subcollections are added depth-first.
    """
    for position, entry in enumerate(items, start=1):
        add_item_to_manifest(manifest, entry, position)
        # Descend into the subcollection (if any) before moving on.
        add_items(manifest, entry["children"])
def add_range(item):
    """
    Build a range describing a subcollection and everything nested in it.

    Child collections become nested ranges and child items are added by id.
    Note that the item's list of children is modified in place when the
    subcollection has an image of its own.
    """
    item_id = item["item_id"]
    item_metadata = get_work_data(item_id)
    children = item["children"]
    # Some 'collections' have images attached that aren't amongst their
    # children. If there's a 'copies' attribute in the metadata, the item is
    # listed as the first of its own children.
    if "copies" in item_metadata:
        children.insert(0, {"item_id": item_id, "item_type": "item"})
    label = prepare_collection_title(item_metadata, item)
    rng = Range(id=f"https://nla.gov.au/{item_id}/range", label=label)
    for child in children:
        child_type = child["item_type"]
        if child_type == "collection":
            rng.add_item(add_range(child))
        elif child_type == "item":
            rng.add_item({"id": f"https://nla.gov.au/{child['item_id']}"})
    return rng
def add_ranges(mf, items):
    """
    Add a range to the manifest for every subcollection in the list of
    collection items. Items that aren't collections are skipped.
    """
    subcollections = (entry for entry in items if entry["item_type"] == "collection")
    for subcollection in subcollections:
        mf.add_range(add_range(subcollection))
def add_homepage(iiif_obj, trove_id):
    """
    Set the homepage of the supplied IIIF object to a record
    that links to the item's page in Trove.
    """
    iiif_obj.homepage = dict(
        id=f"https://nla.gov.au/{trove_id}",
        type="Text",
        label={"en": ["View in Trove"]},
        format="text/html",
    )
def create_manifest_v3(coll_id, repo=None):
    """
    Build a manifest conforming to v3 of the IIIF Presentation API.

    Harvests metadata from all the items within the specified collection,
    and assembles them as a manifest, with each digitised image included as
    a canvas and each subcollection as a range. The manifest is saved as
    UTF-8 encoded JSON to manifests/{coll_id}-v3-manifest.json.

    Parameters:
        coll_id: the nla.obj identifier of the collection.
        repo: optional base url used to build the manifest's id. Defaults
            to https://glam-workbench.net/trove-images.
    """
    coll_metadata = get_work_data(coll_id)
    # If there's no metadata in the page (such as with a finding aid)
    # get the page title.
    if not coll_metadata:
        coll_metadata = {"title": get_page_title(coll_id)}
    # Plain-string labels and values will be tagged as English.
    config.configs["helpers.auto_fields.AutoLang"].auto_lang = "en"
    # Construct the manifest and add metadata
    if repo:
        manifest_id = f"{repo.strip('/')}/{coll_id}-v3-manifest.json"
    else:
        manifest_id = (
            f"https://glam-workbench.net/trove-images/{coll_id}-v3-manifest.json"
        )
    manifest = Manifest(id=manifest_id, label=coll_metadata.get("title", "No title"))
    manifest.summary = f"This manifest was generated on {datetime.now().strftime('%d %b %Y')} by harvesting collection metadata from the Trove website."
    add_homepage(manifest, coll_id)
    add_metadata_v3(manifest, coll_metadata)
    # Get items in this collection and construct item list and range structures.
    items = harvest_collection_items(coll_id, include_subcollections=True)
    # Canvases have to be added before ranges: building the ranges adds
    # extra entries to the harvested item lists.
    add_items(manifest, items)
    add_ranges(manifest, items)
    Path("manifests").mkdir(exist_ok=True)
    # Write UTF-8 explicitly so that the output doesn't depend on the
    # platform's default encoding.
    Path("manifests", f"{coll_id}-v3-manifest.json").write_text(
        manifest.json(), encoding="utf-8"
    )
# CODE FOR IIIF V2 PRESENTATION API
def add_metadata_v2(iiif_obj, item_metadata):
    """
    Set the prepared metadata on a v2 IIIF object (manifest or canvas).
    Nothing is set if there's no metadata to add.
    """
    if metadata := prepare_metadata(item_metadata):
        iiif_obj.set_metadata(metadata)
def add_item_to_seq(seq, item, index):
    """
    Add a canvas for the given item to the sequence of canvases.

    Metadata is retrieved for the item and, if it has an image attached,
    used to create a canvas. The index is the item's position amongst its
    siblings and is used for page numbers in the canvas title.

    Returns the sequence that was passed in.
    """
    item_id = item["item_id"]
    item_url = f"https://nla.gov.au/{item_id}"
    item_metadata = get_work_data(item_id)
    # Presence of 'copies' indicates there's an image attached. This covers
    # both items and 'collections' that have images attached that are not
    # included amongst their children. Anything else gets no canvas.
    if "copies" not in item_metadata:
        return seq
    dimensions = get_dimensions(item_metadata)
    height, width = dimensions["height"], dimensions["width"]
    # An item without a title is probably the child of a subcollection,
    # so describe it using the parent's metadata instead.
    using_parent = not item_metadata.get("title")
    if using_parent:
        try:
            item_metadata = item["parent"]
        except KeyError:
            item_metadata = get_work_data(item["parent_id"])
    # Create the canvas and describe it
    label = prepare_item_title(item_metadata, index, using_parent)
    canvas = seq.canvas(ident=item_url, label=label)
    canvas.set_hw(height, width)
    add_metadata_v2(canvas, item_metadata)
    canvas.thumbnail = {"@id": f"{item_url}-t"}
    # Attach the image itself to the canvas
    anno = canvas.annotation(ident=f"{item_url}/view")
    img = anno.image(ident=f"{item_url}/image", iiif=False)
    img.format = "image/jpeg"
    img.set_hw(height, width)
    return seq
def build_sequence(seq, items):
    """
    Fill the sequence with a canvas for each item in the collection.

    Each item is followed directly by its own children, so nested
    subcollections are added depth-first.
    """
    for position, entry in enumerate(items, start=1):
        seq = add_item_to_seq(seq, entry, position)
        # Descend into the subcollection (if any) before moving on.
        build_sequence(seq, entry["children"])
def build_structure(mf, items, toc=None):
    """
    Build a Table of Contents structure listing subcollections (if any).
    This consists of a series of 'ranges'. Each range can contain more ranges, as
    well as canvases. There's not a lot of documentation about this, so there might be easier
    approaches, but it seems to work ok.

    A range is created in the manifest for every item of type 'collection'.
    Ranges for the items passed in are also added to `toc`, if supplied;
    the recursive calls for nested subcollections don't pass `toc` on.
    Note that the harvested lists of children are modified in place.
    """
    for item in items:
        item_id = item["item_id"]
        if item["item_type"] == "collection":
            # Get metadata from the sub-collection web page
            item_metadata = get_work_data(item_id)
            # Some 'collections' have images attached that aren't amongst their children
            # If there's a 'copies' attribute in the metadata, we'll add the item as one of its own children
            if "copies" in item_metadata:
                item["children"].insert(0, {"item_id": item_id, "item_type": "item"})
            # Prepare a title for the range
            title = prepare_collection_title(item_metadata, item)
            # Create the range within the manifest
            rng = mf.range(ident=f"https://nla.gov.au/{item_id}/range", label=title)
            # The toc is a top-level range which will be displayed in navigation
            if toc:
                toc.add_range(rng)
            # Loop through the children of this subcollection
            for child in item["children"]:
                child_id = f"https://nla.gov.au/{child['item_id']}"
                # Add child ranges to this range
                # NOTE(review): ranges are created above with ids ending in
                # '/range', but child ranges are referenced here without that
                # suffix -- confirm that the references resolve as intended.
                if child["item_type"] == "collection":
                    rng.add_range(child_id)
                # Add canvases to this range
                elif child["item_type"] == "item":
                    rng.add_canvas(child_id)
            # Recurse back through child subcollections adding them all to the manifest
            build_structure(mf, item["children"])
def create_manifest_v2(coll_id, repo=None):
    """
    Build a manifest conforming to v2 of the IIIF Presentation API.

    Harvests metadata from all the items within the specified collection,
    and assembles them as a manifest, with each digitised image included as
    a canvas and each subcollection as a range. The manifest is saved as
    UTF-8 encoded JSON to manifests/{coll_id}-v2-manifest.json.

    Parameters:
        coll_id: the nla.obj identifier of the collection.
        repo: optional base url used to build the manifest's id. Defaults
            to https://glam-workbench.net/trove-images.
    """
    coll_metadata = get_work_data(coll_id)
    items = harvest_collection_items(coll_id, include_subcollections=True)
    coll_url = f"https://nla.gov.au/{coll_id}"
    fac = ManifestFactory()
    # If there's no metadata in the page (such as with a finding aid)
    # get the page title.
    if not coll_metadata:
        coll_metadata = {"title": get_page_title(coll_id)}
    if repo:
        manifest_id = f"{repo.strip('/')}/{coll_id}-v2-manifest.json"
    else:
        manifest_id = (
            f"https://glam-workbench.net/trove-images/{coll_id}-v2-manifest.json"
        )
    # Build the Manifest
    # Fall back to "No title" (as the v3 manifest does) so the manifest
    # always has a label, even if the metadata has no title field.
    mf = fac.manifest(ident=manifest_id, label=coll_metadata.get("title", "No title"))
    add_metadata_v2(mf, coll_metadata)
    mf.attribution = (
        f"National Library of Australia (via Trove). See: https://nla.gov.au/{coll_id}"
    )
    mf.description = f"This manifest was generated on {datetime.now().strftime('%d %b %Y')} by harvesting collection metadata from the Trove website."
    mf.related = {"@id": coll_url, "label": "View in Trove"}
    # And walk through the pages. Canvases have to be added before ranges:
    # building the ranges adds extra entries to the harvested item lists.
    build_sequence(mf.sequence(label="Normal Order"), items)
    # Create a top-level range to display the contents
    toc = mf.range(ident=f"{coll_url}/toc", label="TOC")
    toc.viewingHint = "top"
    # Create ranges for sub-collections
    build_structure(mf, items, toc)
    js = mf.toString(compact=True)
    Path("manifests").mkdir(exist_ok=True)
    # Write UTF-8 explicitly so that the output doesn't depend on the
    # platform's default encoding.
    Path("manifests", f"{coll_id}-v2-manifest.json").write_text(js, encoding="utf-8")
The cell below converts the collection "Postcard portraits of actresses, and of Australian towns, 1900s" to a v3 IIIF manifest.
# Harvest the collection nla.obj-140670968 and save it as a v3 manifest
# in the manifests directory.
create_manifest_v3("nla.obj-140670968")
# IGNORE THIS CELL -- TESTING ONLY
# Runs the same harvest again, but only when the GW_STATUS environment
# variable is set to "dev".
if os.getenv("GW_STATUS") == "dev":
    create_manifest_v3("nla.obj-140670968")
Created by Tim Sherratt for the GLAM Workbench.