Analyse public tags added to Trove

This notebook loads the public tags that users have added to records in Trove from a CSV file created by the accompanying tag harvesting notebook. It then attempts some analysis of the tags.

The complete CSV is too large to store on GitHub. You can download it from Zenodo.
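If you haven't already downloaded the data, a cell like the one below should work. Note that the URL here is a placeholder – copy the actual download link from the dataset's Zenodo record.

In [ ]:
import requests

# Placeholder URL – replace with the download link from the Zenodo record
csv_url = "https://zenodo.org/record/XXXXXXX/files/trove_tags_20240606.csv"

# Stream the download, as the file is too large to hold comfortably in memory
with requests.get(csv_url, stream=True) as response:
    response.raise_for_status()
    with open("trove_tags_20240606.csv", "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)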

User content added to Trove, including tags, is available for reuse under a CC-BY-NC licence.

In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
import altair as alt
import pandas as pd
from wordcloud import WordCloud
In [2]:
# You will need to download the CSV file from Zenodo first
df = pd.read_csv("trove_tags_20240606.csv")

Tags by zone

In [3]:
df["zone"].value_counts()
Out[3]:
zone
newspaper     9537321
book           574038
gazette         98025
picture         96420
music           54840
article         24088
list             7717
map              7015
collection       4186
Name: count, dtype: int64

How many duplicates across zones?

A single resource in Trove can appear in multiple zones – for example, a book that includes maps and illustrations might appear in the 'book', 'picture', and 'map' zones. This means that some of the tags will essentially be duplicates – harvested from different zones, but relating to the same resource. We can quantify this by finding out how many tags there are in the overlapping 'book', 'article', 'picture', 'music', 'map', and 'collection' zones, then dropping duplicates based on the tag, date, and record_id fields.

In [4]:
# Total tags across overlapping zones
df.loc[
    df["zone"].isin(["book", "article", "picture", "music", "map", "collection"])
].shape
Out[4]:
(760587, 4)

Now let's remove the duplicates and see how many are left.

In [5]:
df.loc[
    df["zone"].isin(["book", "article", "picture", "music", "map", "collection"])
].drop_duplicates(subset=["tag", "date", "record_id"]).shape
Out[5]:
(700446, 4)

So there are about 60,000 'duplicates'. This doesn't really matter if you want to examine tagging behaviour within zones, but if you're aggregating tags across zones you might want to remove them, as demonstrated below.

Top tags!

If we're going to look at the most common tags across all zones, then we should probably remove the duplicates mentioned above first.

In [6]:
# Dedupe overlapping zones
deduped_works = df.loc[
    df["zone"].isin(["book", "article", "picture", "music", "map", "collection"])
].drop_duplicates(subset=["tag", "date", "record_id"])

# Non-overlapping zones
other_zones = df.loc[df["zone"].isin(["newspaper", "gazette", "list"])]

# Combine the two to create a new deduped df
deduped = pd.concat([deduped_works, other_zones])
In [7]:
deduped.shape
Out[7]:
(10343509, 4)

Now let's view the 50 most common tags.

In [8]:
deduped["tag"].value_counts()[:50]
Out[8]:
tag
north shore                          46393
lrrsa                                37671
illustration type cartoon            31980
tccc                                 29145
l1                                   25660
poem                                 24755
north sydney council                 23667
australian colonial music            23581
crossword puzzle                     21113
gag cartoon                          20625
political cartoon                    19788
melbourne football club              19224
crossword puzzle solution            19081
fiction                              16955
slvfix                               16860
corrected in full                    16732
tbd                                  15659
australian laureates                 14298
police court                         14037
rowing & sculling                    13986
serials                              12686
cammeray golf club                   12241
weather map                          12056
captain e t miles                    10745
advertising                          10622
second edition                       10460
firewood taxa3                       10436
cricket                              10326
illustration type photo              10242
horse destroyed                      10123
portrait (photo)                      9533
t a reynolds                          9517
firewood taxa                         9195
peanuts animation                     9152
short story                           8838
family notices                        8710
map                                   8545
cane                                  8129
ben bowyang animation                 7903
blondie animation                     7797
william tunks                         7637
phoenix foundry, ballarat             7510
locomotive                            7391
dora animation                        7338
serialised novel                      7310
b.c. animation                        7292
st. leonards school of arts           7183
cryptic crossword puzzle              7117
cryptic crossword puzzle solution     7018
nature notes                          6873
Name: count, dtype: int64

Let's convert the complete tag counts into a new dataframe, and save it as a CSV file.

In [9]:
tag_counts = deduped["tag"].value_counts().to_frame().reset_index()
tag_counts.columns = ["tag", "count"]
In [10]:
tag_counts.to_csv("trove_tag_counts_20240606.csv", index=False)

Let's display the top 200 tags as a word cloud.

In [11]:
# Get the top 200 tags
top_200 = tag_counts[:200].to_dict(orient="records")
In [12]:
# Reshape into a tag:count dictionary.
top_200 = {tag["tag"]: tag["count"] for tag in top_200}
In [13]:
WordCloud(width=1200, height=800).fit_words(top_200).to_image()
Out[13]:
[Word cloud image of the top 200 tags]
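If you want to keep a copy of the word cloud, you can save it straight to an image file (the filename here is just an example):

In [ ]:
# Save the word cloud as a PNG file
WordCloud(width=1200, height=800).fit_words(top_200).to_file("trove_tags_cloud.png")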

Tags on pictures

Most of the tags are on newspaper articles, but we can filter the results to look at the top tags in other zones.

In [14]:
df.loc[df["zone"] == "picture"]["tag"].value_counts()[:20]
Out[14]:
tag
c1                           3276
c3                           2396
sun pic                      1953
politicians                  1100
photos                        967
aviators and aviation         851
1931                          831
1932                          732
1930                          692
daily telegraph pic           690
1928                          671
ship passengers               631
australian colonial music     602
sydney harbour bridge         602
nsw mlas                      563
1927                          530
1925                          495
sydney harbour                493
building and construction     478
ships and shipping            476
Name: count, dtype: int64

View tags by year

We can use the date field to examine when tags were added.

In [15]:
# Convert date to datetime data type
df["date"] = pd.to_datetime(df["date"])
In [16]:
# Create a new column with the year
df["year"] = df["date"].dt.year
In [17]:
# Get counts of tags by year
year_counts = df.value_counts(["year", "zone"]).to_frame().reset_index()
year_counts.columns = ["year", "zone", "count"]
In [29]:
# Chart tags by year
alt.Chart(year_counts).mark_bar(size=25).encode(
    x=alt.X("year:Q", axis=alt.Axis(format="c")),
    y=alt.Y("count:Q", stack=True),
    color="zone:N",
    tooltip=["year:Q", "count:Q", "zone:N"],
).properties(width=700)
Out[29]:
[Stacked bar chart of tag counts by year, coloured by zone]

An obvious feature in the chart above is the large number of tags in zones other than 'newspaper' that were added in 2009. From memory, I believe these 'tags' were automatically ingested from related Wikipedia pages. Unlike the bulk of the tags, these were not added by individual users, so if your interest is user activity you might want to exclude them by filtering on date or zone.
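For example, here's one way you might exclude them, assuming the 2009 spike outside the 'newspaper' zone is all machine-generated:

In [ ]:
# Exclude tags added to non-newspaper zones in 2009 –
# assuming these are all machine-generated
user_tags = df.loc[~((df["year"] == 2009) & (df["zone"] != "newspaper"))]
user_tags.shape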

View tags by month

In [23]:
# This creates a column with the date of the first day of the month in which the tag was added
# We can use this to aggregate by month
df["year_month"] = (
    df["date"] + pd.offsets.MonthEnd(0) - pd.offsets.MonthBegin(normalize=True)
)
In [24]:
# Get tag counts by month
month_counts = df.value_counts(["year_month", "zone"]).to_frame().reset_index()
month_counts.columns = ["year_month", "zone", "count"]
In [33]:
alt.Chart(month_counts).mark_bar().encode(
    x="yearmonth(year_month):T",
    y="count:Q",
    color="zone:N",
    tooltip=["yearmonth(year_month):T", "count", "zone"],
).properties(width=700).interactive()
Out[33]:
[Bar chart of tag counts by month, coloured by zone]

So we can see that the machine-generated tags were added in November 2009. We can even zoom in further to see on which days most of the automatically generated tags were ingested.
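For example, here's a quick way to count the tags added each day in November 2009 (leaving out the newspaper zone):

In [ ]:
# Count tags per day in November 2009, excluding the newspaper zone
nov_2009 = df.loc[
    (df["date"] >= "2009-11-01")
    & (df["date"] < "2009-12-01")
    & (df["zone"] != "newspaper")
]
nov_2009["date"].dt.floor("d").value_counts().sort_index()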

View tags by month in newspapers and gazettes

In [27]:
alt.Chart(
    month_counts.loc[month_counts["zone"].isin(["newspaper", "gazette"])]
).mark_bar().encode(
    x="yearmonth(year_month):T",
    y="count:Q",
    color="zone:N",
    tooltip=["yearmonth(year_month):T", "count", "zone"],
).properties(
    width=700
)
Out[27]:
[Bar chart of monthly tag counts in the newspaper and gazette zones]

What's the trend in newspaper tagging? There seems to have been a drop since the Trove interface was changed, but the month-to-month differences are quite large, so there might be other factors at play.

In [40]:
base = (
    alt.Chart(
        month_counts.loc[
            (month_counts["zone"].isin(["newspaper"]))
            & (month_counts["year_month"] < "2024-06-01")
        ]
    )
    .mark_point()
    .encode(
        x="yearmonth(year_month):T",
        y="count:Q",
        tooltip=["yearmonth(year_month):T", "count", "zone"],
    )
    .properties(width=700)
)

polynomial_fit = base.transform_regression(
    "year_month", "count", method="poly", order=5
).mark_line(color="red")


alt.layer(base, polynomial_fit)
Out[40]:
[Scatter plot of monthly newspaper tag counts with a fitted polynomial trend line]

Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.