Parse map coordinates from metadata¶
The harvest of digitised maps metadata includes a coordinates
column that provides a string representation of either a point or a bounding box. This notebook attempts to parse the coordinate string and convert the values to decimals. It then uses the decimal values to explore the geographical context of Trove's digitised map collection.
The coordinate strings are either:
- Points in the format
(Longitude/Latitude)
, for example: '(E 145°33ʹ/S 37°42ʹ)'. - Bounding boxes in the format
(W--E/N--S)
, for example: '(E 114°00ʹ00ʺ--E 130°00ʹ00ʺ/S 14°00ʹ00ʺ--S 34°00ʹ00ʺ)'.
I'm using lat_lon_parser to convert degrees/minutes/seconds to decimal values.
import logging
from operator import itemgetter
import datetime
import altair as alt
import folium
import ipywidgets as widgets
import pandas as pd
from folium.plugins import FastMarkerCluster
from ipyleaflet import ImageOverlay, Map, WidgetControl
from lat_lon_parser import parse
from vega_datasets import data as vega_data
# Save the parsing errors in a log file
logging.basicConfig(
filename="parse_errors.log", level=logging.DEBUG, format="%(message)s"
)
def check_coord(value, lat_lon):
"""
Make sure that lat/longs are within expected range.
Drop values if outside range.
"""
if lat_lon == "lat" and abs(value) <= 90:
return value
elif lat_lon == "lon" and abs(value) <= 180:
return value
else:
raise ValueError
return None
def check_dimensions(lats, longs):
"""
Try to fix some obvious errors - eg: E 175--W 176 instead of E 175--E 176.
Basically this looks for very long and skinny boxes and changes negative values to positive
on the long side.
I think the settings are fairly conservative, but you might need to adjust the values.
"""
long_diff = longs[1] - longs[0]
lat_diff = lats[1] - lats[0]
if long_diff >= 200 and lat_diff <= 8 and lat_diff > 0:
# print(longs, lats)
longs[0] = abs(longs[0])
if lat_diff >= 50 and long_diff <= 2 and long_diff > 0:
# print(lats, longs)
lats[0] = abs(lats[0])
return sorted(lats), sorted(longs)
def get_center(parsed):
"""
Get the centre of a bounding box.
Returns point coords.
See: https://gis.stackexchange.com/a/394860
"""
e, w, n, s = itemgetter("east", "west", "north", "south")(parsed)
width = max(w, e) - min(w, e)
# get the box height
height = max(s, n) - min(s, n)
# compute the center
center = check_coord(round(min(s, n) + height / 2, 4), "lat"), check_coord(
round(min(w, e) + width / 2, 4), "lon"
)
return center
def parse_value(value):
"""
Parse latitude or longitude values.
"""
values = value.split("--")
# Sometimes single hyphens are used
if len(values) == 1:
values = value.split("-")
coords = [parse(v.strip()) for v in values]
return sorted(coords)
def parse_coords(coords):
"""
Parses a coordinate string, converting values to decimal.
For points -- returns latitude and longitude.
For boxes -- returns centre of box as latitude, longitude, and bounds as east, west, north, and south.
"""
parsed = {}
# Default values
for c in ["east", "west", "north", "south", "latitude", "longitude"]:
parsed[c] = None
try:
# Split string into lat and long using /
long, lat = coords.split("/")
if long.startswith("N"):
long, lat = lat, long
longs = parse_value(long)
lats = parse_value(lat)
except (ValueError, TypeError):
logging.error(coords)
else:
try:
# Bounding box
if len(longs) == 2 and len(lats) == 2:
# This tries to fix some obvious errors
# Comment out if you don't want this
lats, longs = check_dimensions(lats, longs)
parsed["east"] = check_coord(longs[-1], "lon")
parsed["west"] = check_coord(longs[0], "lon")
parsed["north"] = check_coord(lats[-1], "lat")
parsed["south"] = check_coord(lats[0], "lat")
# Get centre of bounding box
latitude, longitude = get_center(parsed)
parsed["latitude"] = latitude
parsed["longitude"] = longitude
# Point
elif len(longs) == 1 and len(lats) == 1:
parsed["latitude"] = check_coord(lats[0], "lat")
parsed["longitude"] = check_coord(longs[0], "lon")
except ValueError:
logging.error(coords)
return parsed
def get_coords(row):
"""
Process a row of the dataset, converting coordinate string into decimal values.
"""
coords = (
str(row["coordinates"]).strip(".").strip("(").strip(")").strip("[").strip("]")
)
return parse_coords(coords)
Load the harvested data.
df = pd.read_csv(
"https://raw.githubusercontent.com/GLAM-Workbench/trove-maps-data/main/single_maps.csv"
)
How many digitised maps have coordinate values?
df.drop
df.loc[df["coordinates"].notnull()].shape[0]
28778
Parse the coordinate strings.
# Extract a subset of the harvested data
df_coords = df.loc[df["coordinates"].notnull()][["title", "url", "coordinates"]].copy()
# Parse the coordinate values and save the results to new columns
df_coords[["east", "west", "north", "south", "latitude", "longitude"]] = df_coords.loc[
df_coords["coordinates"].notnull()
].apply(get_coords, axis=1, result_type="expand")
Let's have a peek at the parsed data.
df_coords.head()
title | url | coordinates | east | west | north | south | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
1 | Ayers, from 5 to 20 m. S.E. b. S. of Fort Poin... | http://nla.gov.au/nla.obj-232162256 | (E 130⁰50'--E 131⁰00'/S 12⁰30'--S 12⁰40') | 131.000000 | 130.833333 | -12.500000 | -12.666667 | -12.5833 | 130.9167 |
3 | Bagot, from Fort Point to 20 m. E.N.E. / drawn... | http://nla.gov.au/nla.obj-232162365 | (E 130⁰40'--E 131⁰05'/S 12⁰20'--S 12⁰30') | 131.083333 | 130.666667 | -12.333333 | -12.500000 | -12.4167 | 130.8750 |
5 | Bundey, N.T. / Frazer S. Crawford, photo-litho... | http://nla.gov.au/nla.obj-232164150 | (E 131⁰20'--E 131⁰25'/S 12⁰30'--S 12⁰40') | 131.416667 | 131.333333 | -12.500000 | -12.666667 | -12.5833 | 131.3750 |
7 | Cavenagh, from 18 to 32 m. S.S.E. of Fort Poin... | http://nla.gov.au/nla.obj-232162631 | (E 130⁰50'--E 131⁰05'/S 12⁰40'--S 12⁰55') | 131.083333 | 130.833333 | -12.666667 | -12.916667 | -12.7917 | 130.9583 |
8 | Colton, from 25 to 40 m. S.E. of Fort Point / ... | http://nla.gov.au/nla.obj-232162851 | (E 131⁰05'--E 131⁰20'/S 12⁰40'--S 12⁰55') | 131.333333 | 131.083333 | -12.666667 | -12.916667 | -12.7917 | 131.2083 |
How many of the coordinate strings could be successfuly parsed as decimal values?
df_coords.loc[
(df_coords["latitude"].notnull()) & (df_coords["longitude"].notnull())
].shape[0]
28205
Save the parsed coordinates.
df_coords.to_csv(f"single_maps_{datetime.datetime.now().strftime('%Y%m%d')}_coordinates.csv", index=False)
Display the geographical distribution of the digitised maps¶
To visualise the locations of the maps we can plot the centre points using Altair. Mouseover the points to view the map titles.
# This loads the country boundaries data
countries = alt.topo_feature(vega_data.world_110m.url, feature="countries")
# First we'll create the world map using the boundaries
background = (
alt.Chart(countries)
.mark_geoshape(fill="lightgray", stroke="white")
.project("equirectangular")
.properties(width=900, height=500)
)
# Then we'll plot the positions of places using circles
points = (
# By loading the data from url we stop the notebook from getting bloated
alt.Chart(
"https://raw.githubusercontent.com/GLAM-Workbench/trove-maps-data/main/single_maps_coordinates.csv"
)
.mark_circle(
# Style the circles
size=10,
color="steelblue",
opacity=0.4,
)
.encode(
# Provide the coordinates
longitude="longitude:Q",
latitude="latitude:Q",
# More info on hover
tooltip=["title:N", "url:N"],
)
.properties(width=900, height=500)
)
# Finally we layer the plotted points on top of the backgroup map
alt.layer(background, points)
The obvious 'crosshairs' at the 0,0 point suggest that some of the coordinates might be missing data. If you explore the points you'll probably find some that seem to be in the wrong position, perhaps because the latitudes or longitudes have been flipped. More checking would be needed if you were using this dataset for detailed analysis.
We can also use Folium to plot the map locations, creating a marker for each point and grouping them into clusters. Folium's FastMarkerCluster plugin lets us add thousands of markers without slowing things down.
m = folium.Map(location=(0, 0), zoom_start=2)
callback = (
"function (row) {"
'var marker = L.marker(new L.LatLng(row[0], row[1]), {color: "blue", tooltip: row[2]});'
"var popup = L.popup({maxWidth: '300'});"
"var icon = L.AwesomeMarkers.icon({"
"icon: 'globe',"
"iconColor: 'white',"
"markerColor: 'blue',"
"prefix: 'glyphicon',"
"extraClasses: 'fa-rotate-0'"
"});"
"marker.setIcon(icon);"
"const title = row[2];"
"const url = row[3];"
"var mytext = $(`<div><p>${title}</p><a target='blank' href='${url}'>${url}</a></div>`)[0];"
"popup.setContent(mytext);"
"marker.bindPopup(popup);"
"return marker};"
)
m.add_child(
FastMarkerCluster(
df_coords.loc[
(df_coords["latitude"].notnull()) & (df_coords["longitude"].notnull())
][["latitude", "longitude", "title", "url"]],
callback=callback,
)
)
m
#m.save("trove-map-clusters.html")
I've also saved the Folium map as an HTML file for easy browsing.
Display map image on a modern basemap¶
To view a single map in context, we can use IPyLeaflet to layer the map image on top of a modern basemap, using the bounding box data to position the image. We'll also add a slider to change the opacity of the map image, so we can compare it to the basemap underneath.
The results will, of course, depend on both the accuracy of the coordinates and the nature of the map. For more accurate results we'd want to use something like MapWarper to georectify the map image. Note too that if a map image has a lot of white space or text around the map itself, the proportions of the image might not match the bounds of the map. This will result in the image being distorted.
# Select a random record with a bounding box
random = df_coords.loc[df_coords["east"].notnull()].sample(n=1).iloc[0]
default_layout = widgets.Layout(width="900px", height="540px")
# Create the basemap and centre it on the map location
m = Map(
center=(random["latitude"], random["longitude"]),
zoom=8,
scroll_wheel_zoom=True,
layout=default_layout,
)
# Add the map image as an overlay
image = ImageOverlay(
# Image url
url=f"{random['url']}/image",
# Postion the image using the bounding box data
bounds=((random["south"], random["west"]), (random["north"], random["east"])),
)
# Add a slider to control the opacity of the image
slider = widgets.FloatSlider(
min=0,
max=1,
value=1, # Opacity is valid in [0,1] range
orientation="vertical", # Vertical slider is what we want
readout=False, # No need to show exact value
layout=widgets.Layout(width="2em"),
)
# Connect slider value to opacity property of the Image Layer
widgets.jslink((slider, "value"), (image, "opacity"))
m.add_control(WidgetControl(widget=slider))
m.add_layer(image)
print(random["title"])
print(random["url"])
# Set the map zoom so you can see all of the image.
m.fit_bounds([[random["south"], random["west"]], [random["north"], random["east"]]])
m
Septentrionalium terrarum descriptio / per Gerardum Mercatorem cum privilegio http://nla.gov.au/nla.obj-231198671
Map(center=[75.0, 0.0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_…
Created by Tim Sherratt for the GLAM Workbench.