Create a network graph visualisation of Australian government departments

This notebook visualises changes in Australian government departments over time, using data from Wikidata. It creates a hierarchically-ordered network graph where each agency is represented as a node. The colour of a node is determined by the decade in which the agency was created, and its vertical position by the year of creation. The size of the node indicates how long the agency was in existence, while edges between nodes connect agencies to their successors. The earliest agencies will be at the top of the graph.

You can view the query used to generate this graph using the Wikidata Query Service.

In [1]:
import json

import arrow
import pandas as pd
from IPython.display import IFrame, display
from pyvis.network import Network
from SPARQLWrapper import JSON, SPARQLWrapper
In [2]:
# The Wikidata Query Service endpoint that will run the query
endpoint = "https://query.wikidata.org/sparql"

# SPARQL query text. For every matching item it selects the identifier, start date,
# optional end date and optional successor identifier, plus a display label built
# from the English name and the "start-end" year range.
agency_query = """
SELECT
  ?item ?label
  ?id ?start_date ?end_date ?after_id
WHERE
{
  ?item wdt:P31 wd:Q57605562;
        wdt:P10856 ?id;
        wdt:P571 ?start_date;
        rdfs:label ?agency_label.
  OPTIONAL { ?item wdt:P576 ?end_date. }
  OPTIONAL { ?item wdt:P1366 ?after.
             ?after wdt:P10856 ?after_id. }
  FILTER (lang(?agency_label) = "en").
  # Combine start and end year into a single string, setting end date to "" if it doesn't exist
  BIND(concat(xsd:string(YEAR(?start_date)), "-", COALESCE(xsd:string(YEAR(?end_date)), "")) as ?date_range)
  # Combine dept name and date range into a single string
  BIND(concat(?agency_label, " (", ?date_range, ")") as ?label)
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

sparql = SPARQLWrapper(endpoint)
sparql.setQuery(agency_query)
In [3]:
# Request JSON so the converted response can be indexed like nested dicts and lists
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Each binding becomes one row; nested keys are flattened using "_" as the
# separator, giving column names such as id_value and start_date_value
bindings = results["results"]["bindings"]
df = pd.json_normalize(bindings, sep="_")
In [4]:
# Preview the first few rows to check the columns produced by flattening the results
df.head()
Out[4]:
item_type item_value after_id_type after_id_value id_type id_value start_date_datatype start_date_type start_date_value end_date_datatype end_date_type end_date_value label_type label_value
0 uri http://www.wikidata.org/entity/Q16956444 literal CA 1954 literal CA 1869 http://www.w3.org/2001/XMLSchema#dateTime literal 1975-04-21T00:00:00Z http://www.w3.org/2001/XMLSchema#dateTime literal 1975-12-22T00:00:00Z literal Department of the Environment (1975-1975)
1 uri http://www.wikidata.org/entity/Q16956444 literal CA 1957 literal CA 1869 http://www.w3.org/2001/XMLSchema#dateTime literal 1975-04-21T00:00:00Z http://www.w3.org/2001/XMLSchema#dateTime literal 1975-12-22T00:00:00Z literal Department of the Environment (1975-1975)
2 uri http://www.wikidata.org/entity/Q16956449 literal CA 1476 literal CA 1407 http://www.w3.org/2001/XMLSchema#dateTime literal 1971-05-31T00:00:00Z http://www.w3.org/2001/XMLSchema#dateTime literal 1972-12-19T00:00:00Z literal Department of the Environment, Aborigines and ...
3 uri http://www.wikidata.org/entity/Q16956449 literal CA 1479 literal CA 1407 http://www.w3.org/2001/XMLSchema#dateTime literal 1971-05-31T00:00:00Z http://www.w3.org/2001/XMLSchema#dateTime literal 1972-12-19T00:00:00Z literal Department of the Environment, Aborigines and ...
4 uri http://www.wikidata.org/entity/Q16956449 literal CA 1486 literal CA 1407 http://www.w3.org/2001/XMLSchema#dateTime literal 1971-05-31T00:00:00Z http://www.w3.org/2001/XMLSchema#dateTime literal 1972-12-19T00:00:00Z literal Department of the Environment, Aborigines and ...
In [5]:
# Tableau style colours from http://tableaufriction.blogspot.com/2012/11/finally-you-can-use-tableau-data-colors.html
# One colour per line, written as a dot-separated "red.green.blue" triplet
rgb = """
255.187.120
255.127.14
174.199.232
44.160.44
31.119.180
255.152.150
214.39.40
197.176.213
152.223.138
148.103.189
247.182.210
227.119.194
196.156.148
140.86.75
127.127.127
219.219.141
199.199.199
188.189.34
158.218.229
23.190.207
""".split()


def make_darker(colour, factor=0.75):
    """
    Return a darker version of a colour.

    Every channel in `colour` (integer-like values, such as the strings
    obtained by splitting "255.187.120" on ".") is multiplied by `factor`,
    rounded to the nearest whole number and returned as a string.
    """
    darkened = []
    for channel in colour:
        scaled = round(int(channel) * factor)
        darkened.append(str(scaled))
    return darkened


def make_lighter(colour, factor=0.75):
    """
    Return a lighter version of a colour.

    Every channel in `colour` (integer-like values, such as the strings
    obtained by splitting "255.187.120" on ".") is moved towards 255 by
    `factor` of the remaining distance, and returned as a string.
    """
    lightened = []
    for channel in colour:
        value = int(channel)
        # Distance left to full brightness, scaled and rounded
        boost = round((255 - value) * factor)
        lightened.append(str(boost + value))
    return lightened


# Split each dotted triplet into its separate channel values once, up front
channels = [triplet.split(".") for triplet in rgb]

# List of Tableau style colours
colours = ["rgb(" + ",".join(parts) + ")" for parts in channels]
# List of darkened colours (used for node borders)
borders = ["rgb(" + ",".join(make_darker(parts)) + ")" for parts in channels]
# List of lightened colours (used for highlighted nodes)
highlights = ["rgb(" + ",".join(make_lighter(parts)) + ")" for parts in channels]

# Create groups for each decade in the date range, assigning a different colour for each group
# A decade is keyed by the first three digits of its years, from "190" to "202"
decades = list(map(str, range(190, 203)))
decade_groups = {
    decade: {
        "color": {
            "background": fill,
            "border": outline,
            "highlight": {"background": glow, "border": outline},
        }
    }
    for decade, fill, outline, glow in zip(decades, colours, borders, highlights)
}

# Work out the possible range of values for the length of an agency's existence:
# at most the number of days from 1 January 1901 until now, at least a single day
earliest = arrow.get("1901-01-01")
max_days = (arrow.utcnow() - earliest).days
min_days = 1
current_range = max_days - min_days


def calculate_size(start, end, current_range=current_range, biggest=150, smallest=30):
    """
    Calculate the size of a node based on an agency's length of existence.

    The lifespan in days is rescaled linearly to fall within the desired range:
    a one-day lifespan maps to `smallest`, and a lifespan of `current_range` + 1
    days maps to `biggest`.
    See: https://stackoverflow.com/a/929107

    Parameters:
        start: the agency's start date, in a form accepted by `arrow.get`.
        end: the agency's end date. If it can't be parsed (for example because
            it is missing), the agency is measured up to the present.
        current_range: number of days between the shortest and longest
            possible lifespans.
        biggest: node size given to the longest possible lifespan.
        smallest: node size given to the shortest possible lifespan.

    Returns:
        The node size as a float.
    """
    start_date = arrow.get(start)
    try:
        end_date = arrow.get(end)
    except (ValueError, TypeError):
        # No usable end date, so measure the lifespan up to now
        end_date = arrow.utcnow()
    # Treat anything shorter than a day (including an end date that precedes the
    # start date) as the one-day minimum, so the size never drops below `smallest`
    lifespan = max((end_date - start_date).days, 1)
    # Shift by `smallest` so the result starts at the bottom of the requested range
    return (((lifespan - 1) * (biggest - smallest)) / current_range) + smallest
In [6]:
# Create the empty PyVis network that the agency nodes and edges are added to,
# configured for use in a notebook
net = Network(notebook=True, cdn_resources="remote")
In [7]:
# Turn every row of the agency data into a node in the network
for agency in df.itertuples():
    agency_id = agency.id_value
    started = agency.start_date_value
    # Tooltip containing a hyperlink to the agency record in RecordSearch
    tooltip = f"<a target='_blank' href='https://recordsearch.naa.gov.au/scripts/AutoSearch.asp?Number={agency_id}'>{agency_id}, {agency.label_value}</a>"
    net.add_node(
        agency_id,
        label=agency_id,
        title=tooltip,
        # The first three digits of the start year identify the decade;
        # the matching group colours the node
        group=started[:3],
        # The start year is used as the level, which helps to position
        # the agency hierarchically by creation date
        level=int(started[:4]),
        # The longer an agency existed, the larger its node
        size=calculate_size(started, agency.end_date_value),
    )
In [8]:
# Add edges between a node and its successors.
# The query only requires a successor to have an identifier, so it may not have a
# row (and therefore a node) of its own. PyVis refuses to create an edge to or from
# a node that doesn't exist, so those links are skipped rather than halting the notebook.
node_ids = set(net.get_nodes())
for agency in df.dropna(subset=["after_id_value"]).itertuples():
    if agency.id_value in node_ids and agency.after_id_value in node_ids:
        net.add_edge(agency.id_value, agency.after_id_value)
In [9]:
# Network graph configuration
# It's easier to manage this as Python dicts, then convert to JSON for PyVis

# A hierarchical layout with levels based on start date will order the agencies by time
hierarchical_layout = {
    "enabled": True,
    "sortMethod": "directed",
    "shakeTowards": "leaves",
    "nodeSpacing": 20,
    "levelSeparation": 40,
    "treeSpacing": 20,
}

# Edges are smoothed and carry an arrowhead at their "to" end
edge_style = {
    "arrows": {"to": {"enabled": True, "scaleFactor": 0.5}},
    "arrowStrikethrough": False,
    "smooth": {"enabled": True},
    "color": {"color": "#b0bec5", "inherit": True},
}

options = dict(
    configure={"enabled": False},
    layout={"hierarchical": hierarchical_layout},
    physics={"hierarchicalRepulsion": {"avoidOverlap": 1, "nodeDistance": 180}},
    nodes={"font": {"size": 15}},
    # Assign colours by decade
    groups=decade_groups,
    edges=edge_style,
)
In [10]:
# Pass the configuration to PyVis as a "var options = {...}" string
options_js = "var options = " + json.dumps(options)
net.set_options(options_js)

# Doing this rather than net.show() gives better results and predictable sizes
html_file = "agencies-network.html"
net.write_html(html_file, notebook=True)
display(IFrame(html_file, height=800, width="100%"))

Created by Tim Sherratt for the GLAM Workbench.

The development of the Wikidata section of the GLAM Workbench was supported by Wikimedia Australia.