Gathering historical data about the addition of newspaper titles to Trove¶

The number of digitised newspapers available through Trove has increased dramatically since 2009. Understanding when newspapers were added is important for historiographical purposes, but there's no data about this available directly from Trove. This notebook uses web archives to extract lists of newspapers in Trove over time, and chart Trove's development.

Trove has always provided a browseable list of digitised newspaper titles. The URL and format of this list have changed over time, but it's possible to find captures of this page in the Internet Archive and extract the full list of titles. The pages are also captured in the Australian Web Archive, but the Wayback Machine has a more detailed record.

The pages that I'm looking for are:

  • http://trove.nla.gov.au/ndp/del/titles
  • https://trove.nla.gov.au/newspaper/about
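
Individual captures of these pages can be retrieved from the Wayback Machine by combining a 14-digit timestamp with the original URL; the `id_` flag asks for the archived content without the Wayback navigation banner injected. A minimal sketch of the URL pattern used in the harvesting code below (the timestamp is just a sample):

```python
timestamp = "20091112000713"  # sample 14-digit Wayback timestamp (YYYYMMDDHHMMSS)
original = "http://trove.nla.gov.au/ndp/del/titles"

# The id_ suffix after the timestamp requests the raw capture,
# without Wayback's injected banner and rewritten links.
capture_url = f"https://web.archive.org/web/{timestamp}id_/{original}"
print(capture_url)
```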

This notebook creates the following data files:

  • trove_newspaper_titles_2009_2021.csv – complete dataset of captures and titles
  • trove_newspaper_titles_first_appearance_2009_2021.csv – filtered dataset, showing only the first appearance of each title / place / date range combination

I've also created a browseable list of titles, showing when they first appeared in Trove.

In [1]:
import json
import re
from pathlib import Path

import altair as alt
import arrow
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from surt import surt

s = requests_cache.CachedSession("archived_titles")
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

Code for harvesting web archive captures¶

We're using the Memento protocol to get a list of captures. See the Web Archives section of the GLAM Workbench for more details.

In [4]:
# The code in this cell is copied from notebooks in the Web Archives section of the GLAM Workbench (https://glam-workbench.net/web-archives/)
# In particular see: https://glam-workbench.net/web-archives/#find-all-the-archived-versions-of-a-web-page

# These are the repositories we'll be using
TIMEGATES = {
    "awa": "https://web.archive.org.au/awa/",
    "nzwa": "https://ndhadeliver.natlib.govt.nz/webarchive/wayback/",
    "ukwa": "https://www.webarchive.org.uk/wayback/en/archive/",
    "ia": "https://web.archive.org/web/",
}


def convert_lists_to_dicts(results):
    """
    Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.
    Renames keys to standardise IA with other Timemaps.
    """
    if results:
        keys = results[0]
        results_as_dicts = [dict(zip(keys, v)) for v in results[1:]]
    else:
        results_as_dicts = results
    for d in results_as_dicts:
        d["status"] = d.pop("statuscode")
        d["mime"] = d.pop("mimetype")
        d["url"] = d.pop("original")
    return results_as_dicts


def get_capture_data_from_memento(url, request_type="head"):
    """
    For OpenWayback systems this can get some extra capture info to insert into Timemaps.
    """
    if request_type == "head":
        response = s.head(url)
    else:
        response = s.get(url)
    headers = response.headers
    length = headers.get("x-archive-orig-content-length")
    status = headers.get("x-archive-orig-status")
    status = status.split(" ")[0] if status else None
    mime = headers.get("x-archive-orig-content-type")
    mime = mime.split(";")[0] if mime else None
    return {"length": length, "status": status, "mime": mime}


def convert_link_to_json(results, enrich_data=False):
    """
    Converts link formatted Timemap to JSON.
    """
    data = []
    for line in results.splitlines():
        parts = line.split("; ")
        if len(parts) > 1:
            link_type = re.search(
                r'rel="(original|self|timegate|first memento|last memento|memento)"',
                parts[1],
            ).group(1)
            if link_type == "memento":
                link = parts[0].strip("<>")
                timestamp, original = re.search(r"/(\d{14})/(.*)$", link).groups()
                capture = {
                    "urlkey": surt(original),
                    "timestamp": timestamp,
                    "url": original,
                }
                if enrich_data:
                    capture.update(get_capture_data_from_memento(link))
                    print(capture)
                data.append(capture)
    return data


def get_timemap_as_json(timegate, url, enrich_data=False):
    """
    Get a Timemap then normalise results (if necessary) to return a list of dicts.
    """
    tg_url = f"{TIMEGATES[timegate]}timemap/json/{url}/"
    response = s.get(tg_url)
    response_type = response.headers["content-type"]
    if response_type == "text/x-ndjson":
        data = [json.loads(line) for line in response.text.splitlines()]
    elif response_type == "application/json":
        data = convert_lists_to_dicts(response.json())
    elif response_type in ["application/link-format", "text/html;charset=utf-8"]:
        data = convert_link_to_json(response.text, enrich_data=enrich_data)
    else:
        data = []
    return data
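
The Internet Archive delivers its Timemaps as a JSON array of arrays, with the first row acting as a header. A minimal illustration of the reshaping that `convert_lists_to_dicts()` performs (the capture values here are samples, not real data):

```python
# Sample IA-style timemap: first row is the header, later rows are captures.
results = [
    ["urlkey", "timestamp", "original", "mimetype", "statuscode"],
    ["au,gov,nla,trove)/ndp/del/titles", "20091112000713",
     "http://trove.nla.gov.au/ndp/del/titles", "text/html", "200"],
]

# Zip each row against the header to get a list of dicts
keys = results[0]
captures = [dict(zip(keys, row)) for row in results[1:]]

# Rename keys to match the other repositories' Timemaps
for capture in captures:
    capture["status"] = capture.pop("statuscode")
    capture["mime"] = capture.pop("mimetype")
    capture["url"] = capture.pop("original")

print(captures[0]["status"])
# 200
```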

Harvest the title data from the Internet Archive¶

This gets the web page captures from the Internet Archive, scrapes the list of titles from the page, then does a bit of normalisation of the title data.

In [6]:
titles = []

# These are the pages that listed available titles.
# There was a change in 2016
pages = [
    {"url": "http://trove.nla.gov.au/ndp/del/titles", "path": "/ndp/del/title/"},
    {"url": "https://trove.nla.gov.au/newspaper/about", "path": "/newspaper/title/"},
]

for page in pages:
    for capture in get_timemap_as_json("ia", page["url"]):
        if capture["status"] == "200":
            url = f'https://web.archive.org/web/{capture["timestamp"]}id_/{capture["url"]}'
            # print(url)
            capture_date = arrow.get(capture["timestamp"][:8], "YYYYMMDD").format(
                "YYYY-MM-DD"
            )
            # print(capture_date)
            response = s.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            title_links = soup.find_all("a", href=re.compile(page["path"]))
            for title in title_links:
                # Get the title text
                full_title = title.get_text().strip()

                # Get the title id
                title_id = re.search(r"\/(\d+)\/?$", title["href"]).group(1)

                # Most of the code below is aimed at normalising the publication place and dates values to allow for easy grouping & deduplication
                brief_title = re.sub(r"\(.+\)\s*$", "", full_title).strip()
                try:
                    details = re.search(r"\((.+)\)\s*$", full_title).group(1).split(":")
                except AttributeError:
                    place = ""
                    dates = ""
                else:
                    try:
                        place = details[0].strip()
                        # Normalise states
                        try:
                            place = re.sub(
                                r"(, )?([A-Za-z]+)[\.\s]*$",
                                lambda match: f'{match.group(1) if match.group(1) else ""}{match.group(2).upper()}',
                                place,
                            )
                        except AttributeError:
                            pass
                        # Normalise dates
                        dates = " - ".join(
                            [d.strip() for d in details[1].strip().split("-")]
                        )
                    except IndexError:
                        place = ""
                        dates = " - ".join(
                            [d.strip() for d in details[0].strip().split("-")]
                        )
                titles.append(
                    {
                        "title_id": title_id,
                        "full_title": full_title,
                        "title": brief_title,
                        "place": place,
                        "dates": dates,
                        "capture_date": capture_date,
                        "capture_timestamp": capture["timestamp"],
                    }
                )

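The shape of the parsing above can be checked against a sample title string. This is a cut-down sketch of the same regexes, covering only the simple case where the title has both a place and a date range:

```python
import re

full_title = "Canberra Times (ACT : 1926-1954)"

# Strip the trailing parenthesised details to get the brief title
brief_title = re.sub(r"\(.+\)\s*$", "", full_title).strip()

# Split the parenthesised details into place and dates
details = re.search(r"\((.+)\)\s*$", full_title).group(1).split(":")
place = details[0].strip()
dates = " - ".join(d.strip() for d in details[1].strip().split("-"))

print(brief_title, "|", place, "|", dates)
# Canberra Times | ACT | 1926 - 1954
```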
Convert the title data to a DataFrame for analysis¶

In [7]:
df = pd.DataFrame(titles)
In [8]:
df
Out[8]:
title_id full_title title place dates capture_date capture_timestamp
0 34 Advertiser (Adelaide, SA : 1889-1931) Advertiser Adelaide, SA 1889 - 1931 2009-11-12 20091112000713
1 13 Argus (Melbourne, Vic. : 1848-1954) Argus Melbourne, VIC 1848 - 1954 2009-11-12 20091112000713
2 16 Brisbane Courier (Qld. : 1864-1933) Brisbane Courier QLD 1864 - 1933 2009-11-12 20091112000713
3 11 Canberra Times (ACT : 1926-1954) Canberra Times ACT 1926 - 1954 2009-11-12 20091112000713
4 24 Colonial Times (Hobart, Tas. : 1828-1857) Colonial Times Hobart, TAS 1828 - 1857 2009-11-12 20091112000713
... ... ... ... ... ... ... ...
107017 1331 South Australian Record and Australasian and S... South Australian Record and Australasian and S... London, ENGLAND 1840 - 1841 2022-01-16 20220116142742
107018 1369 Territory of Papua Government Gazette (Papua N... Territory of Papua Government Gazette Papua New GUINEA 1906 - 1942 2022-01-16 20220116142742
107019 1371 Territory of Papua and New Guinea Government G... Territory of Papua and New Guinea Government G... 1949 - 1971 2022-01-16 20220116142742
107020 1370 Territory of Papua-New Guinea Government Gazet... Territory of Papua-New Guinea Government Gazette 1945 - 1949 2022-01-16 20220116142742
107021 1391 Tribune (Philippines : 1932 - 1945) Tribune PHILIPPINES 1932 - 1945 2022-01-16 20220116142742

107022 rows × 7 columns

In [9]:
# Number of captures
len(df["capture_timestamp"].unique())
Out[9]:
130
In [10]:
# Number of days on which the pages were captured
len(df["capture_date"].unique())
Out[10]:
120

Save this dataset as a CSV file.

In [11]:
df.to_csv("trove_newspaper_titles_2009_2021.csv", index=False)

How did the number of titles change over time?¶

In [12]:
# Drop duplicates in cases where there were multiple captures on a single day
captures_df = df.drop_duplicates(subset=["capture_date", "full_title"])

# Calculate totals per capture
capture_totals = captures_df["capture_date"].value_counts().to_frame().reset_index()
capture_totals.columns = ["capture_date", "total"]
capture_totals
Out[12]:
capture_date total
0 2022-01-16 1700
1 2022-01-10 1700
2 2021-12-13 1697
3 2021-11-20 1690
4 2021-11-16 1690
... ... ...
115 2010-05-01 37
116 2009-11-24 34
117 2009-11-22 34
118 2009-12-12 34
119 2009-11-12 34

120 rows × 2 columns

In [13]:
alt.Chart(capture_totals).mark_line(point=True).encode(
    x=alt.X("capture_date:T", title="Date captured"),
    y=alt.Y("total:Q", title="Number of newspaper titles"),
    tooltip=[alt.Tooltip("capture_date:T", format="%e %b %Y"), "total:Q"],
).properties(width=700)
Out[13]:

When did titles first appear?¶

For historiographical purposes, it's useful to know when a particular title first appeared in Trove. Here we'll only keep the first appearance of each title (or any subsequent changes to its date range / location).
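
Because the captures are in chronological order, `drop_duplicates()` (which keeps the first row of each group by default) leaves only the earliest capture of each title / place / dates combination. A small sketch with made-up rows:

```python
import pandas as pd

sample = pd.DataFrame([
    {"title": "Canberra Times", "dates": "1926 - 1954", "capture_date": "2009-11-12"},
    {"title": "Canberra Times", "dates": "1926 - 1954", "capture_date": "2010-05-01"},
    {"title": "Canberra Times", "dates": "1926 - 1995", "capture_date": "2012-12-27"},
])

# Keeps the first (earliest) row for each title/dates combination;
# the changed date range in 2012 survives as a separate row.
first = sample.drop_duplicates(subset=["title", "dates"])
print(first["capture_date"].tolist())
# ['2009-11-12', '2012-12-27']
```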

In [14]:
first_appearance = df.drop_duplicates(subset=["title", "place", "dates"])
In [15]:
first_appearance
Out[15]:
title_id full_title title place dates capture_date capture_timestamp
0 34 Advertiser (Adelaide, SA : 1889-1931) Advertiser Adelaide, SA 1889 - 1931 2009-11-12 20091112000713
1 13 Argus (Melbourne, Vic. : 1848-1954) Argus Melbourne, VIC 1848 - 1954 2009-11-12 20091112000713
2 16 Brisbane Courier (Qld. : 1864-1933) Brisbane Courier QLD 1864 - 1933 2009-11-12 20091112000713
3 11 Canberra Times (ACT : 1926-1954) Canberra Times ACT 1926 - 1954 2009-11-12 20091112000713
4 24 Colonial Times (Hobart, Tas. : 1828-1857) Colonial Times Hobart, TAS 1828 - 1857 2009-11-12 20091112000713
... ... ... ... ... ... ... ...
105023 1773 Dawn Newsletter (Perth, WA : 1952 - 1954) Dawn Newsletter Perth, WA 1952 - 1954 2022-01-10 20220110214554
105112 1388 La Rondine (Perth, WA : 1970 - 1974; 1983 - 1984) La Rondine Perth, WA 1970 - 1974; 1983 - 1984 2022-01-10 20220110214554
105121 1537 Listening Post (Perth, WA : 1921 - 1954) Listening Post Perth, WA 1921 - 1954 2022-01-10 20220110214554
105274 99 Western Argus (Kalgoorlie, WA : 1894 - 1895) Western Argus Kalgoorlie, WA 1894 - 1895 2022-01-10 20220110214554
106887 1649 North Coolgardie Herald and Miners Daily News ... North Coolgardie Herald and Miners Daily News Menzies, WA 1899 - 1904 2022-01-16 20220116142742

2120 rows × 7 columns

Find when a particular newspaper first appeared.

In [16]:
first_appearance.loc[first_appearance["title"] == "Canberra Times"]
Out[16]:
title_id full_title title place dates capture_date capture_timestamp
3 11 Canberra Times (ACT : 1926-1954) Canberra Times ACT 1926 - 1954 2009-11-12 20091112000713
9395 11 Canberra Times (ACT : 1926 - 1995) Canberra Times ACT 1926 - 1995 2012-12-27 20121227113753

Generate an alphabetical list for easy browsing. View the results as a Gist.

In [17]:
with Path("titles_list.md").open("w") as titles_list:
    for title, group in first_appearance.groupby(["title", "title_id"]):
        places = " | ".join(group["place"].unique())
        titles_list.write(
            f'<h4><a href="http://nla.gov.au/nla.news-title{title[1]}">{title[0]} ({places})</a></h4>'
        )
        titles_list.write(
            group.sort_values(by="capture_date")[
                ["capture_date", "dates", "place"]
            ].to_html(index=False)
        )

Save this dataset to CSV.

In [18]:
first_appearance.to_csv(
    "trove_newspaper_titles_first_appearance_2009_2021.csv", index=False
)

Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.