import datetime
import json
import os
import re
import time
import warnings
from functools import reduce

warnings.simplefilter(action="ignore", category=FutureWarning)

import altair as alt
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from IPython.display import FileLink, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from dotenv import load_dotenv

# Shared HTTP session used for every request below. Responses are cached by
# requests_cache, and requests that come back with a 502, 503 or 504 status
# are retried (up to 5 times, with a backoff between attempts).
retries = Retry(status_forcelist=[502, 503, 504], backoff_factor=1, total=5)
s = requests_cache.CachedSession()
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))

# Load environment variables (e.g. TROVE_API_KEY) from a .env file if there is one
load_dotenv()

True

# API_KEY holds your Trove API key.
# A non-empty TROVE_API_KEY environment variable takes precedence (useful for testing);
# otherwise paste your key between the quotes, replacing the placeholder text.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"

def get_total_results(params):
    """
    Ask the Trove API how many records match a search.

    The supplied `params` are not modified: a copy is sent with `n` set to 0
    so that no records are returned, only the totals. The count is read from
    the first category in the response and returned as an integer.
    """
    count_params = {**params, "n": 0}
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=count_params,
        headers={"X-API-KEY": API_KEY},
    )
    first_category = response.json()["category"][0]
    return int(first_category["records"]["total"])


def get_fulltext_url(links):
    """
    Find the link to the digitised version of a work.

    `links` is a list of identifier dicts, each with `linktype` and `value` keys.
    Returns the `value` of the first link whose type is "fulltext" and whose
    value contains "nla.obj", or None if there is no such link.
    """
    digitised = (
        link["value"]
        for link in links
        if link["linktype"] == "fulltext" and "nla.obj" in link["value"]
    )
    return next(digitised, None)


def get_copyright_status(response=None, url=None):
    """
    Scrape the copyright statement from a digitised work's web page.

    Pass either an already-fetched `response`, or a `url` to fetch with the
    shared session. Returns the text of the `p.decorative` element inside the
    `div#tab-access` element, or an empty string if there is no usable
    response or the page has no such element.
    """
    if not response and url:
        response = s.get(url)
    if not response:
        return ""
    soup = BeautifulSoup(response.text, "lxml")
    try:
        statement = soup.find("div", id="tab-access").find("p", class_="decorative")
        return str(statement.string)
    except AttributeError:
        # No access tab (or no statement inside it)
        return ""


def get_work_data(url):
    """
    Fetch a work's HTML page and extract the metadata embedded in it.

    The page is expected to contain a `var work = JSON.parse(JSON.stringify({...`
    statement; the JSON object inside it is parsed and returned as a dict.
    Returns an empty dict if the statement can't be found.
    """
    response = s.get(url)
    try:
        embedded = re.search(
            r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
        )
        work_data = json.loads(embedded.group(1))
    except (AttributeError, TypeError):
        # No embedded work data on this page
        work_data = {}
    # Copyright details are not taken from the embedded data here;
    # get_copyright_status() scrapes them from the page instead.
    # Pause briefly, but only when the page was really requested rather than
    # served from the cache.
    if not response.from_cache:
        time.sleep(0.2)
    return work_data


def find_field_content(record, tag, subfield):
    """
    Search a single MARC record for a tag/subfield combination.

    Each entry in `record["datafield"]` has a `tag` and a `subfield`, where
    `subfield` is either one dict or a list of dicts with `code` and `content`
    keys. Returns the content of the first matching subfield, or an empty
    string if nothing matches or the record doesn't have the expected shape.
    """
    try:
        for field in record["datafield"]:
            if field["tag"] != tag:
                continue
            candidates = field["subfield"]
            # A lone subfield is not wrapped in a list, so wrap it here
            if not isinstance(candidates, list):
                candidates = [candidates]
            for candidate in candidates:
                if candidate["code"] == subfield:
                    return candidate["content"]
    except (KeyError, TypeError):
        return ""
    return ""


def get_marc_field(work_data, tag, subfield):
    """
    Search every MARC record in a work's metadata for a tag/subfield combination.

    Returns the first non-empty value found by find_field_content(), or an
    empty string if the work has no MARC data or none of its records match.
    """
    marc_data = work_data.get("marcData")
    if not marc_data:
        return ""
    contents = (
        find_field_content(record, tag, subfield) for record in marc_data["record"]
    )
    return next((content for content in contents if content), "")


def format_bytes(size):
    """
    Convert a number of bytes into a value and unit suitable for display.

    Returns a `(value, unit)` tuple, where unit is one of "B", "KB", "MB",
    "GB" or "TB" (using multiples of 1024). For example, 1536 gives
    `(1.5, "KB")`. Values too big for the largest unit are still expressed in
    "TB". If `size` is empty, zero or None, returns `("", "")`.
    """
    if not size:
        return "", ""
    # 2**10 = 1024
    power = 2**10
    prefixes = ("", "K", "M", "G", "T")
    n = 0
    # Use >= so that exact multiples (e.g. 1024 bytes) move up to the next unit,
    # and stop at the last prefix rather than running off the end of the labels.
    while size >= power and n < len(prefixes) - 1:
        size /= power
        n += 1
    return size, prefixes[n] + "B"


def get_publication_details(work_data):
    """
    Build a publication statement from MARC field 260.

    Looks up subfields a, b and c (in that order) with get_marc_field() and
    joins the non-empty values with single spaces.
    """
    found = (get_marc_field(work_data, 260, code) for code in ["a", "b", "c"])
    return " ".join([str(value) for value in found if value])


def get_map_data(work_data):
    """
    Look for image dimensions and file size information in the embedded work data.

    Only works whose `accessConditions` value is "Unrestricted" are examined.
    Each entry in `copies` is checked in turn:

    - a copy with `technicalmetadata` supplies the pixel `width` and `height`
    - otherwise, a copy with a `copyrole` of "m", "o", "i" or "fd" and an
      `access` value of "true" supplies the `filesize` (in bytes) and `copy_role`

    Values are accumulated across the copies (if several copies supply the same
    value, the last one wins), so the dimensions and the file size can come
    from different copies.

    Returns a dict with the keys `filesize_string`, `filesize`, `width`,
    `height` and `copy_role`. Anything that can't be found keeps its default
    value ("" for the strings, 0 for the numbers).
    """
    map_data = {
        "filesize_string": "",
        "filesize": 0,
        "width": 0,
        "height": 0,
        "copy_role": "",
    }
    # Initialise once, outside the loop, so that an empty list of copies can't
    # leave anything undefined and later copies don't wipe out earlier values.
    width = 0
    height = 0
    num_bytes = 0
    copy_role = ""
    try:
        # Make sure there's a downloadable version
        if work_data.get("accessConditions") != "Unrestricted":
            return map_data
        for copy in work_data.get("copies") or []:
            # Get the pixel dimensions
            if "technicalmetadata" in copy:
                width = copy["technicalmetadata"].get("width", 0)
                height = copy["technicalmetadata"].get("height", 0)
            # Get filesize in bytes
            # (.get() is used because not every copy has these keys)
            elif (
                copy.get("copyrole") in ["m", "o", "i", "fd"]
                and copy.get("access") == "true"
            ):
                num_bytes = copy.get("filesize", 0)
                copy_role = copy.get("copyrole", "")
    except AttributeError:
        # The data isn't in the expected shape, fall back to the defaults
        return map_data
    # Convert bytes to something human friendly
    if num_bytes:
        size, unit = format_bytes(num_bytes)
        if size:
            map_data["filesize_string"] = "{:.2f}{}".format(size, unit)
    map_data["filesize"] = num_bytes
    map_data["width"] = width
    map_data["height"] = height
    map_data["copy_role"] = copy_role
    return map_data


def get_maps():
    """
    Harvest metadata about digitised single maps from the Trove API.

    Pages through the search results using the `nextStart` value in each
    response. For every work that links to a digitised object (`nla.obj-...`),
    details from the API record are combined with data extracted from the
    object's web page (image dimensions, file size, MARC fields, copyright
    status).

    Returns a list of dicts, one for each work with a digitised version.
    """
    # Use https (as get_total_results does) so the API key isn't sent in the clear
    url = "https://api.trove.nla.gov.au/v3/result"
    maps = []
    params = {
        "q": '"nla.obj-"',
        "category": "image",
        "l-artType": "map",
        "l-availability": "y",
        "l-format": "Map/Single map",
        "bulkHarvest": "true",  # Needed to maintain a consistent order across requests
        "n": 100,
        "encoding": "json",
    }
    start = "*"
    total = get_total_results(params)
    with tqdm(total=total) as pbar:
        while start:
            params["s"] = start
            response = s.get(url, params=params, headers={"X-API-KEY": API_KEY})
            data = response.json()
            records = data["category"][0]["records"]
            # If there's a nextStart value then we use it to request the next page of results
            start = records.get("nextStart")
            # A page without any works shouldn't stop the harvest
            works = records.get("work", [])
            for work in tqdm(works, leave=False):
                # Check to see if there's a link to a digital version
                try:
                    fulltext_url = get_fulltext_url(work["identifier"])
                except KeyError:
                    continue
                if not fulltext_url:
                    continue
                # Skip links that don't include a numeric object identifier
                id_match = re.search(r"(nla\.obj\-\d+)", fulltext_url)
                if not id_match:
                    continue
                work_data = get_work_data(fulltext_url)
                map_data = get_map_data(work_data)
                # Get basic metadata
                # You could add more work data here
                # Check the Trove API docs for work record structure
                map_data["title"] = work.get("title", "")
                map_data["url"] = fulltext_url
                map_data["work_url"] = work.get("troveUrl", "")
                map_data["identifier"] = id_match.group(1)
                map_data["date"] = work.get("issued", "")
                map_data["creators"] = "|".join(work.get("contributor", []))
                map_data["publication"] = get_publication_details(work_data)
                map_data["extent"] = work_data.get("extent", "")
                # The copyright status scraped from the page is used in preference
                # to the `copyrightPolicy` value in the embedded work data, as it
                # is thought more likely to be accurate
                map_data["copyright_status"] = get_copyright_status(url=fulltext_url)
                map_data["scale"] = get_marc_field(work_data, 255, "a")
                map_data["coordinates"] = get_marc_field(work_data, 255, "c")
                maps.append(map_data)
            # Advance by the number of works actually received, as the last
            # page will usually hold fewer than 100
            pbar.update(len(works))
    return maps

# Run the harvest. get_maps() requests a web page for every digitised map it finds,
# so this can take a long time unless the responses are already in the cache.
maps = get_maps()

# Convert to dataframe
# convert_dtypes() selects the best available dtype for each column, so whole
# numbers are stored as integers rather than floats
df = pd.DataFrame(maps).convert_dtypes()

def merge_column(columns):
    """
    Combine a collection of values into a single string.

    Empty values are dropped, lists are flattened, and everything else is
    converted to a string. The distinct values are then sorted and joined
    with " | ".
    """
    distinct = set()
    for item in columns:
        if isinstance(item, list):
            distinct.update(str(member) for member in item if member)
        elif item:
            distinct.add(str(item))
    return " | ".join(sorted(distinct))


def merge_records(df):
    """
    Collapse rows that share the same identifier and url into a single row.

    For every other column, the values found in each identifier/url group are
    combined into one string by merge_column(). Returns a new dataframe with
    one row for each distinct identifier/url pair.
    """
    # These two columns identify a digitised map and always have a single value
    keys = ["identifier", "url"]

    # Columns that potentially have multiple values which will be merged
    columns = [
        "title",
        "work_url",
        "date",
        "creators",
        "publication",
        "extent",
        "copyright_status",
        "scale",
        "coordinates",
        "filesize_string",
        "filesize",
        "width",
        "height",
        "copy_role",
    ]

    def attach(left, right):
        # Add the merged column in `right` to the rows accumulated so far
        return pd.merge(left, right, on=keys, how="left")

    # Start with the distinct identifier/url pairs, then build a separate
    # dataframe holding the merged values of each column in turn
    frames = [df[keys].drop_duplicates()]
    frames.extend(
        df.groupby(keys)[column].apply(merge_column).reset_index()
        for column in columns
    )

    # Join all the individual dataframes into one, linking on identifier and url
    return reduce(attach, frames)

# Combine rows that share the same identifier and url into a single row
df_merged = merge_records(df)

# Reorder columns
# (merge_records() returns `identifier` and `url` first; this puts `title` second)
df_merged = df_merged[
    [
        "identifier",
        "title",
        "url",
        "work_url",
        "date",
        "creators",
        "publication",
        "extent",
        "copyright_status",
        "scale",
        "coordinates",
        "filesize_string",
        "filesize",
        "width",
        "height",
        "copy_role",
    ]
]

# Save to CSV
# The file name includes today's date in YYYYMMDD form
csv_file = f"single_maps_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
df_merged.to_csv(csv_file, index=False)
# Display a link to the saved file in the notebook
display(FileLink(csv_file))

# Load the previously harvested dataset from GitHub (replaces `df`),
# so the analysis below can be run without repeating the harvest
df = pd.read_csv(
    "https://raw.githubusercontent.com/GLAM-Workbench/trove-maps-data/main/single_maps.csv"
)

# Number of maps in the dataset
print(f"{len(df):,} maps")

35,042 maps

# Show every row whose url appears more than once
# (keep=False marks all the rows in each group of duplicates, not just the repeats)
df.loc[df.duplicated(["url"], keep=False)].sort_values("url")

# Shape (rows, columns) of the subset of maps that have a file size
df.loc[df["filesize"].notnull()].shape

(30738, 16)

# Number of maps for each copy role
df["copy_role"].value_counts()

copy_role
m    30344
i      364
o       30
Name: count, dtype: Int64

# Add up the sizes of all the files and display the total in a readable form
size, unit = format_bytes(df["filesize"].sum())
print(f"{size:.2f}{unit}")

14.41TB

# Number of maps for each copyright status
df["copyright_status"].value_counts()

copyright_status
Out of Copyright            25281
In Copyright                 8618
Edition Out of Copyright      631
Copyright Undetermined        349
Copyright Uncertain           110
Unknown                        22
Edition In Copyright            4
Name: count, dtype: Int64

# Turn the copyright status counts into a two-column dataframe for charting
counts = df["copyright_status"].value_counts().to_frame().reset_index()
counts.columns = ["status", "count"]
# Horizontal bar chart: one bar per status, bar length shows the number of maps
alt.Chart(counts).mark_bar().encode(
    y="status:N", x="count", tooltip="count"
).properties(height=200)

# Convert bytes to mb
df["mb"] = df["filesize"] / 2**10 / 2**10
# Create 500mb-sized bins and count the number of files in each bin.
# The bin edges are worked out from the largest file, because pd.cut() silently
# leaves out any value that falls outside the bins.
bin_width = 500
top_edge = (int(df["mb"].max() // bin_width) + 1) * bin_width
sizes = (
    pd.cut(df["mb"], bins=list(range(0, top_edge + 1, bin_width)))
    # sort=False keeps the bins in ascending size order instead of ordering by count
    .value_counts(sort=False)
    .to_frame()
    .reset_index()
)
sizes.columns = ["mb", "count"]
# Convert intervals to strings for display in chart
sizes["mb"] = sizes["mb"].astype(str)
sizes

# Bar chart of the number of files in each size bin
# sort=None stops Altair re-sorting the bins, so the bars follow the row order of `sizes`
alt.Chart(sizes).mark_bar().encode(
    x=alt.X("mb:N", sort=None), y="count:Q", tooltip="count:Q"
).properties(width=400)

# Show the map with the largest file.
# idxmax() returns an index label, so the row is looked up with .loc (by label)
# rather than .iloc (by position).
df.loc[df["filesize"].idxmax()]

identifier                                         nla.obj-2458846831
title               Geologic map of the Arabian Peninsula / compil...
url                             https://nla.gov.au/nla.obj-2458846831
work_url                       https://trove.nla.gov.au/work/12332257
date                                                             1963
creators                                     Geological Survey (U.S.)
publication                       Washington, D.C. : The Survey, 1963
extent                                   1 map : col. ; 113 x 132 cm.
copyright_status                                         In Copyright
scale                                             Scale 1:2,000,000 ;
coordinates                               (E 34°--E 61°/N 32°--N 12°)
filesize_string                                                3.64GB
filesize                                                   3907679404
width                                                           43211
height                                                          30144
copy_role                                                           m
mb                                                        3726.653484
Name: 1536, dtype: object

# Maps with files larger than 3GB (bytes are divided by 1024 three times to get GB)
df.loc[(df["filesize"] / 2**10 / 2**10 / 2**10) > 3]

# Show the map with the greatest pixel width.
# idxmax() returns an index label, so the row is looked up with .loc (by label)
# rather than .iloc (by position).
df.loc[df["width"].idxmax()]

identifier                                          nla.obj-636346192
title               Land status petroleum mining agreement in resp...
url                               http://nla.gov.au/nla.obj-636346192
work_url                      https://trove.nla.gov.au/work/230363372
date                                                             1968
creators                               Brunei Shell Petroleum Company
publication                                                          
extent                                            1 map ; 286 x 58 cm
copyright_status                                         In Copyright
scale                                                  Scale 1:10,000
coordinates         (E 114°09ʹ53ʺ--E 114°23ʹ34ʺ/N 4°38ʹ42ʺ--N 4°32...
filesize_string                                                2.80GB
filesize                                                   3008938460
width                                                           68453
height                                                          14652
copy_role                                                           m
mb                                                        2869.547329
Name: 13749, dtype: object

# Show the map with the greatest pixel height.
# idxmax() returns an index label, so the row is looked up with .loc (by label)
# rather than .iloc (by position).
df.loc[df["height"].idxmax()]

identifier                                         nla.obj-2824964225
title               Traverse of the Ramu River, navigated by the "...
url                             https://nla.gov.au/nla.obj-2824964225
work_url                       https://trove.nla.gov.au/work/36757550
date                                                        1921-1945
creators                   Stanley, Evan R. (Evan Richard), 1885-1924
publication                                                          
extent                   1 map : on architectural linen ; 410 x 76 cm
copyright_status                                         In Copyright
scale                                                  Scale 1:31,760
coordinates                  (E 144°35'--E 144°50'/S 4°01'--S 5°11').
filesize_string                                                2.85GB
filesize                                                   3057135688
width                                                           13840
height                                                          73630
copy_role                                                           m
mb                                                        2915.511787
Name: 31282, dtype: object

	identifier	title	url	work_url	date	creators	publication	extent	copyright_status	scale	coordinates	filesize_string	filesize	width	height	copy_role	mb
2314	nla.obj-1059119069	[Western Australia gold mining leases]. Depart...	http://nla.gov.au/nla.obj-1059119069	https://trove.nla.gov.au/work/14409767	1905	Western Australia. Department of Mines	[South Kensington : Science Museum Library, 19...	1 map ; 65 x 99 cm.	Out of Copyright	Scale [ca. 1:31 680]	(E 122°10'30"/S 28°49'00")	1.19GB	1281565428	25062	17045	m	1222.196033
2315	nla.obj-1059119069	[Western Australia gold mining leases]. Depart...	http://nla.gov.au/nla.obj-1059119069	https://trove.nla.gov.au/work/14409773	1905	Western Australia. Department of Mines	[South Kensington : Science Museum Library, 19...	1 map ; 65 x 99 cm.	Out of Copyright	Scale [ca. 1:31 680]	(E 122°10'30"/S 28°49'00")	1.19GB	1281565428	25062	17045	m	1222.196033
2471	nla.obj-1059122984	[Western Australia gold mining leases]. (1.2.0...	http://nla.gov.au/nla.obj-1059122984	https://trove.nla.gov.au/work/14626171	1905	Western Australia. Department of Mines	[London : Science Museum Library, 1905?]	1 map ; 65 x 98 cm.	Edition Out of Copyright	Scale [ca. 1:15 840]	(E 121°09'/S 30°57')	1.14GB	1228526852	23731	17256	m	1171.614506
2467	nla.obj-1059122984	[Western Australia gold mining leases]. (1.2.0...	http://nla.gov.au/nla.obj-1059122984	https://trove.nla.gov.au/work/14626082	1905	Western Australia. Department of Mines	[London : Science Museum Library, 1905?]	1 map ; 65 x 98 cm.	Edition Out of Copyright	Scale [ca. 1:15 840]	(E 121°09'/S 30°57')	1.14GB	1228526852	23731	17256	m	1171.614506
2472	nla.obj-1059123632	[Western Australia gold mining leases]. 20.5.0...	http://nla.gov.au/nla.obj-1059123632	https://trove.nla.gov.au/work/14626181	1905	Western Australia. Department of Mines	[London : Science Museum Library, 1905?]	1 map ; 65 x 98 cm.	Edition Out of Copyright	Scale [ca. 1:15 840]	(E 121°09'/S 30°57')	1.20GB	1287582536	25084	17110	m	1227.934395
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4155	nla.obj-678124518	Geological atlas 1:50 000 series. Geological S...	https://nla.gov.au/nla.obj-678124518	https://trove.nla.gov.au/work/16946369	1987	Geological Survey of Tasmania	Hobart, Tas. : The Dept., 1987	1 map : col. ; 56 x 84 cm. on sheet 69 x 107 cm.	In Copyright	Scale 1:50 000	(E 148⁰00'--E 148⁰30'/S 41⁰15'--S 41⁰30')	1.16GB	1245749060	25436	16325	m	1188.038883
23268	nla.obj-893284845	Vegetation survey of Western Australia. mapped...	https://nla.gov.au/nla.obj-893284845	https://trove.nla.gov.au/work/32952780	1973	Beard, J. S. (John Stanley), 1916-2011	Perth : Vegmap Publications, 1973	1 map ; 47 x 59 cm., on sheet 59 x 70 cm. + 1 ...	In Copyright	Scale 1:250,000	(E 123°00ʹ--E 124°30ʹ/S 33°00ʹ--S 34°00ʹ).	748.65MB	785014388	18087	14467	m	748.64806
7405	nla.obj-893284845	Vegetation survey of Western Australia. mapped...	https://nla.gov.au/nla.obj-893284845	https://trove.nla.gov.au/work/19937156	1973	Beard, J. S. (John Stanley), 1916-2011	Perth : Vegmap Publications, 1973	1 map ; 47 x 59 cm., on sheet 59 x 70 cm. + 1 ...	In Copyright	Scale 1:250,000	(E 123°00ʹ--E 124°30ʹ/S 33°00ʹ--S 34°00ʹ).	748.65MB	785014388	18087	14467	m	748.64806
132	nla.obj-961623531	Australia 1:25 000 topographic survey. Produce...	https://nla.gov.au/nla.obj-961623531	https://trove.nla.gov.au/work/10384783	1979	Western Australia. Department of Lands and Sur...	Perth (W.A.) : Dept. of Lands and Surveys, 1979	1 map : col. ; 55 x 50 cm.	In Copyright	Scale 1:25,000	(E 115°45ʹ00ʺ--E 115°52ʹ30ʺ/S 32°15ʹ00ʺ--S 32°...	676.20MB	709045908	14078	16788	m	676.198872
3339	nla.obj-961623531	Australia 1:25 000 topographic survey.: Wellar...	https://nla.gov.au/nla.obj-961623531	https://trove.nla.gov.au/work/159335939	1891-1979	Western Australia. Department of Lands and Sur...	Perth (W.A.) : Dept. of Lands and Surveys, 1979	1 map : col. ; 55 x 50 cm.	In Copyright	Scale 1:25,000	(E 115°45ʹ00ʺ--E 115°52ʹ30ʺ/S 32°15ʹ00ʺ--S 32°...	676.20MB	709045908	14078	16788	m	676.198872

	mb	count
0	(0, 500]	16143
1	(500, 1000]	11454
2	(1000, 1500]	2733
3	(1500, 2000]	311
4	(2000, 3000]	84
5	(3000, 3500]	12

	identifier	title	url	work_url	date	creators	publication	extent	copyright_status	scale	coordinates	filesize_string	filesize	width	height	copy_role	mb
1536	nla.obj-2458846831	Geologic map of the Arabian Peninsula / compil...	https://nla.gov.au/nla.obj-2458846831	https://trove.nla.gov.au/work/12332257	1963	Geological Survey (U.S.)	Washington, D.C. : The Survey, 1963	1 map : col. ; 113 x 132 cm.	In Copyright	Scale 1:2,000,000 ;	(E 34°--E 61°/N 32°--N 12°)	3.64GB	3907679404	43211	30144	m	3726.653484
2611	nla.obj-2567709383	Map of the coastal plain of British Guiana	https://nla.gov.au/nla.obj-2567709383	https://trove.nla.gov.au/work/152215030	1955	Bleackley, D. (David)	[S.l.] : Geological Survey of British Guiana, ...	1 map : col. ; 88 x 205 cm.	In Copyright	Scale [ca. 1:143,000].	(W 60°00ʹ--W 57°00ʹ/N 9°00ʹ--N 6°00ʹ).	3.08GB	3305391052	49731	22155	m	3152.266552
5240	nla.obj-591001246	Map of the City of Rangoon and suburbs 1928-29...	http://nla.gov.au/nla.obj-591001246	https://trove.nla.gov.au/work/182743876	1932	Geological Survey of India		1 map on 4 sheets : colour ; 154 x 126 cm, sheets	Out of Copyright	Scale 1:12,000. 1 in. = 1000 ft.	(E 96°06ʹ--E 96°13ʹ/N 16°53ʹ--N 16°44ʹ).	3.38GB	3623879488	31769	38023	m	3456.000793
6241	nla.obj-3009772762	Shqipëria, hartë fiziko-politike : shkalla 1...	https://nla.gov.au/nla.obj-3009772762	https://trove.nla.gov.au/work/191812727	1965	Samimi, Ergjin		1 map on 3 sheets : color ; 173 x 91 cm, sheet...	In Copyright	Scale 1:200,000. 1 cm to 2 km ;	(E 18°58ʹ--E 21°12ʹ/N 42°40ʹ--N 39°35ʹ).	3.04GB	3266078212	23106	47117	m	3114.774906
7871	nla.obj-568387103	Peta geologi teknik daerah Jakarta - Bogor = E...	http://nla.gov.au/nla.obj-568387103	https://trove.nla.gov.au/work/20208553	1970	Indonesia. Direktorat Geologi		1 map : colour ; 157 x 107 cm	In Copyright	Scale 1:50,000	(E 106°33'00"--E 106°59'00"/S 5°59'00"--S 6°38...	3.05GB	3279210576	26384	41429	m	3127.298904
8362	nla.obj-400826638	Nyūginia-tō zenzu / Taiwan Sōtokufu Gaijibu...	http://nla.gov.au/nla.obj-400826638	https://trove.nla.gov.au/work/205481810	1942	Taiwan		1 map on 4 sheets : colour ; 172 x 99 cm	Out of Copyright	Scale 1:5,000,000 ;	(E 126°00ʹ--E 156°00ʹ/N 4°00ʹ--S 12°00ʹ).	3.04GB	3264456500	42659	25508	m	3113.228321
11917	nla.obj-568387099	Geological map of Djawa and Madura / compiled ...	http://nla.gov.au/nla.obj-568387099	https://trove.nla.gov.au/work/218208895	1963	Indonesia. Direktorat Geologi		1 map : colour ; 78 x 216 cm.	In Copyright	Scale 1:500,000	(E 104°58ʹ28ʺ--E 113°98ʹ28ʺ/S 5°30ʹ00ʺ--S 9°00...	3.08GB	3311801600	52593	20990	m	3158.380127
14886	nla.obj-1954049619	A new chart of the South Pacific Ocean, includ...	https://nla.gov.au/nla.obj-1954049619	https://trove.nla.gov.au/work/237421392	1849-1857	James Imray and Son		1 map ; 96.4 x 183.0 cm	Edition Out of Copyright	Scale approximately 1:11,000,000 at the equator	(E 111°--W 60°/N 20°--S 60°).	3.00GB	3223026784	44606	24085	m	3073.717865
15311	nla.obj-2618718155	Proposed plan for the site for the federal cap...	https://nla.gov.au/nla.obj-2618718155	https://trove.nla.gov.au/work/239126400	1911	Wilson, George, died 1923		1 map : colour ; 141 x 141 cm	Out of Copyright	Scale 1:4,800 ;	(E 149°08'/S 35°18').	3.12GB	3344969196	33600	33184	m	3190.011211
15477	nla.obj-2824965115	Map of the mandated territory of New Guinea / ...	https://nla.gov.au/nla.obj-2824965115	https://trove.nla.gov.au/work/239997009	1925	Krahe, R. E.		1 map : transparent architectural linen ; 210 ...	In Copyright	Scale 1:1,000,000	(E 140°50'00"--E 159°41'00"/S 0°33'00"--S 11°5...	3.37GB	3622362060	53028	22770	m	3454.553661
34455	nla.obj-230705067	Plan shewing pastoral leases and claims in the...	https://nla.gov.au/nla.obj-230705067	https://trove.nla.gov.au/work/8818311	1885-1950	South Australia. Surveyor-General's Office	Adelaide : Surveyor General's Office, 1885	1 map on 3 sheets ; 169.1 x 99.7 cm., on sheet...	Out of Copyright	Scale [1:1 000 000]. 16 miles to 1 inch.	<NA>	3.08GB	3308608288	25576	43121	m	3155.334747

Exploring digitised maps in Trove

Getting map images

Setting things up

You'll need a Trove API key to harvest the data.

Define some functions to do the work

Download map data

Convert to dataframe and save to CSV

Let's explore the results