# Let's import the libraries we need.
import json
import os
import re
import time
from datetime import timedelta
from functools import reduce
from pathlib import Path

import arrow
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sqlite_utils import Database
from tqdm.auto import tqdm

# One retry policy shared by every adapter: up to five retries with backoff
# whenever the server answers 502, 503 or 504.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])

# `r` is a plain session; `s` caches responses for 30 days.
r = requests.Session()
s = requests_cache.CachedSession(expire_after=timedelta(days=30))
for session in (r, s):
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retries))

load_dotenv()

# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
env_api_key = os.getenv("TROVE_API_KEY")
if env_api_key:
    API_KEY = env_api_key

def get_total_results(params, headers):
    """
    Ask the Trove API how many records match a search.

    The request is sent through the cached session using a copy of `params`
    with `n` set to 0; the caller's dict is left untouched. The total is read
    from the first category in the response and returned as an int.
    """
    count_params = {**params, "n": 0}
    payload = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=count_params,
        headers=headers,
    ).json()
    first_category = payload["category"][0]
    return int(first_category["records"]["total"])


def get_value(record, field, keys=("value",)):
    """
    Get the values of a field.

    If the field holds a list of dicts, each key in `keys` is tried in turn and
    the first key found in every dict is used to build a list of strings, with
    runs of whitespace collapsed to a single space. Any other field value is
    returned unchanged.

    An empty list is returned when the field is missing or when none of `keys`
    fits the dicts, so callers can safely concatenate or iterate the result.
    """
    value = record.get(field, [])
    if not (value and isinstance(value[0], dict)):
        return value
    for key in keys:
        try:
            return [re.sub(r"\s+", " ", v[key]) for v in value]
        except KeyError:
            # At least one dict lacks this key, so try the next candidate
            continue
    # No candidate key matched -- return an empty list rather than None
    return []


def merge_values(record, fields, keys=("value",)):
    """
    Merges values from multiple fields, removing any duplicates.

    Each field is read with `get_value` using `keys`. Empty or missing results
    are skipped, and a bare string is kept whole rather than being split into
    characters. `None` entries are dropped. The result keeps the order in which
    values were first seen.
    """
    values = []
    for field in fields:
        found = get_value(record, field, keys)
        if not found:
            continue
        if isinstance(found, str):
            found = [found]
        values += found
    # dict.fromkeys removes duplicates while preserving first-seen order
    return list(dict.fromkeys(v for v in values if v is not None))


def flatten_values(record, field, key="type"):
    """
    Turn each entry of a field into a single string.
    Entries that contain `key` are rendered as '<key value>: <value>'; entries
    without it contribute just their 'value'. A missing field gives an empty list.
    """
    return [
        f"{entry[key]}: {entry['value']}" if key in entry else entry["value"]
        for entry in record.get(field, [])
    ]


def flatten_identifiers(record):
    """
    Collect the record's control numbers as flat strings.
    Only `identifier` entries whose type is 'control number' are used. An entry
    with a `source` is rendered as 'source: value', otherwise just its value.
    """
    control_numbers = []
    for entry in record.get("identifier", []):
        if "type" not in entry or entry["type"] != "control number":
            continue
        if "source" in entry:
            control_numbers.append(f"{entry['source']}: {entry['value']}")
        else:
            control_numbers.append(entry["value"])
    return control_numbers


def get_fulltext_url(links):
    """
    Collect the full text links found in a list of identifiers.
    A link qualifies when its linktype is 'fulltext', its value contains
    'nla.obj' and its linktext contains 'edeposit'. Each match is returned as a
    dict with the normalised `url` (https, no leading 'www.') and its `link_text`.
    """
    matches = []
    for link in links:
        if "linktype" not in link or link["linktype"] != "fulltext":
            continue
        link_text = link.get("linktext", "")
        if "nla.obj" not in link["value"] or "edeposit" not in link_text:
            continue
        # Upgrade http to https, then drop any leading www.
        secure_url = re.sub(r"^http\b", "https", link["value"])
        matches.append(
            {
                "url": re.sub(r"^https://www\.", "https://", secure_url),
                "link_text": link_text,
            }
        )
    return matches


def get_catalogue_url(links):
    """
    Return the first NLA catalogue link found in a list of identifiers.
    A catalogue link has the linktype 'notonline' and a value containing
    'nla.cat'. An empty string is returned when there is no such link.
    """
    catalogue_urls = (
        link["value"]
        for link in links
        if "linktype" in link
        and link["linktype"] == "notonline"
        and "nla.cat" in link["value"]
    )
    return next(catalogue_urls, "")


def has_fulltext_link(links):
    """
    Check if a list of identifiers includes a fulltext url pointing to an NLA resource.

    A link counts when its linktype is 'fulltext', its value contains 'nla.obj'
    and its linktext contains 'edeposit'. Always returns a bool: True if at
    least one link matches, otherwise False.
    """
    return any(
        "linktype" in link
        and link["linktype"] == "fulltext"
        and "nla.obj" in link["value"]
        and "edeposit" in link.get("linktext", "")
        for link in links
    )


def has_holding(holdings, nucs):
    """
    Check if a list of holdings includes one of the supplied nucs.

    Always returns a bool: True if any holding's `nuc` is in `nucs`,
    otherwise False.
    """
    return any(holding.get("nuc") in nucs for holding in holdings)


def get_digitised_versions(work):
    """
    Filter the work's versions down to those whose `identifier` list passes
    `has_fulltext_link`, ie includes a fulltext url pointing to an NLA resource.
    Versions without an `identifier` field are left out.
    """
    return [
        version
        for version in work["version"]
        if "identifier" in version and has_fulltext_link(version["identifier"])
    ]


def get_nuc_versions(work, nucs=("ANL", "ANL:DL")):
    """
    Get the versions from the given work that are held by one of the supplied nucs.

    By default this selects versions held by the NLA (ANL or ANL:DL). A version
    is included when any entry in its `holding` list has a `nuc` found in
    `nucs`. Versions without a `holding` field are left out.
    """
    return [
        version
        for version in work["version"]
        if "holding" in version
        and any(holding.get("nuc") in nucs for holding in version["holding"])
    ]


def _select_versions(record, filter_by, nucs):
    """Pick the versions of a work to include, according to `filter_by`."""
    if filter_by == "nuc":
        return get_nuc_versions(record, nucs)
    versions = get_digitised_versions(record)
    # Sometimes there are fulltext links on work but not versions
    if not versions and has_fulltext_link(record.get("identifier", [])):
        versions = record["version"]
    return versions


def _find_fulltext_urls(metadata, version, record, filter_by):
    """Find fulltext urls, looking in the sub version, then the version, then the work."""
    # Sometimes fulltext identifiers are only available on the version (or the
    # work) rather than the sub version, so look in the sub version first and
    # fall back from there.
    # Sometimes there are multiple fulltext urls associated with a version:
    # eg a collection page and a publication. If so all of them are returned.
    # They could end up pointing to the same digitised publication, but we can
    # sort that out later. Aim here is to try and not miss any possible routes
    # to digitised publications!
    for source in (metadata, version, record):
        urls = get_fulltext_url(source.get("identifier", []))
        if urls:
            return urls
    # When filtering by nuc, keep the record even though it has no fulltext url
    if filter_by == "nuc":
        return [{"url": "", "link_text": ""}]
    return []


def _build_work_row(record, metadata, url):
    """Assemble one output row from a work, a sub version's metadata and a url."""
    return {
        # This is not the full set of available fields,
        # adjust as necessary.
        "title": get_value(metadata, "title"),
        "work_url": record.get("troveUrl"),
        "work_type": record.get("type", []),
        "contributor": merge_values(
            metadata, ["creator", "contributor"], ["value", "name"]
        ),
        "publisher": get_value(metadata, "publisher"),
        "date": merge_values(metadata, ["date", "issued"]),
        # Using merge here because I've noticed some duplicate values
        "type": merge_values(metadata, ["type"]),
        "format": get_value(metadata, "format"),
        "rights": merge_values(metadata, ["rights", "licenseRef"]),
        "language": get_value(metadata, "language"),
        "extent": get_value(metadata, "extent"),
        "subject": merge_values(metadata, ["subject"]),
        "spatial": get_value(metadata, "spatial"),
        # Flattened type/value
        "is_part_of": flatten_values(metadata, "isPartOf"),
        # Only get control numbers and flatten
        "identifier": flatten_identifiers(metadata),
        "fulltext_url": url["url"],
        "fulltext_url_text": url["link_text"],
        # The sub version may have no identifiers at all
        "catalogue_url": get_catalogue_url(metadata.get("identifier", [])),
        # Could also add in data from bibliographicCitation
        # Although the types used in citations seem to vary by work and format.
    }


def harvest_works(
    params,
    filter_by="url",
    nucs=("ANL", "ANL:DL"),
    output_file="harvested-metadata.ndjson",
):
    """
    Harvest metadata relating to digitised works.

    Pages through the Trove API results for `params` and writes one JSON line
    per work / sub version / fulltext url combination to `output_file`.

    The filter_by parameter selects records for inclusion in the dataset, options:
        * url -- only include versions that have an NLA fulltext url
        * nuc -- only include versions that have an NLA nuc (ANL or ANL:DL)

    The caller's `params` dict is not modified. An HTTP error response raises
    `requests.HTTPError`.
    """
    default_params = {
        "category": "all",
        "bulkHarvest": "true",
        "n": 100,
        "encoding": "json",
        "include": ["links", "workversions", "holdings"],
    }
    # Work on a copy so the caller's dict isn't changed; the defaults above
    # take precedence over any matching keys supplied by the caller.
    params = {**params, **default_params}
    headers = {"X-API-KEY": API_KEY}
    total = get_total_results(params, headers)
    start = "*"
    with Path(output_file).open("w") as ndjson_file, tqdm(total=total) as pbar:
        while start:
            params["s"] = start
            response = r.get(
                "https://api.trove.nla.gov.au/v3/result",
                params=params,
                headers=headers,
            )
            # Fail with a clear HTTP error rather than a KeyError further down
            response.raise_for_status()
            records = response.json()["category"][0]["records"]
            items = records.get("item", [])
            for item in items:
                for category, record in item.items():
                    if category != "work":
                        continue
                    for version in _select_versions(record, filter_by, nucs):
                        for sub_version in version["record"]:
                            metadata = sub_version["metadata"]["dc"]
                            urls = _find_fulltext_urls(
                                metadata, version, record, filter_by
                            )
                            for url in urls:
                                work = _build_work_row(record, metadata, url)
                                ndjson_file.write(f"{json.dumps(work)}\n")
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get("nextStart")
            pbar.update(len(items))

# Search parameters for the harvest: records matching "nla.obj" with the
# nuc ANL:NED, limited to the Periodical format.
params = {
    "q": '"nla.obj" nuc:"ANL:NED"',
    "l-format": "Periodical",  # Journals only
    # "l-availability": "y",
}

# Run the harvest with the default `url` filter, saving results as newline-delimited JSON
harvest_works(params, output_file="ned-periodicals.ndjson")

# Plan for checking the harvested titles:
# - get the current list of ids for comparison
# - loop through the titles and get the page type of each one
# - if the page type is a pdf, check whether id == parent_id
# - if it's an issue with a parent, check that the parent is in the set of titles
# - if not, try to get some details of the parent and add them to the title dataset


def get_metadata(id):
    """
    Load a work's HTML page and return the work data embedded in it.

    `id` can be a full url or a bare identifier, which is appended to
    'https://nla.gov.au/'. The data is the JSON object passed to
    `JSON.parse(JSON.stringify(...))` in the page's `var work` assignment.
    An empty dict is returned if the page contains no such assignment.
    """
    url = id if id.startswith("http") else "https://nla.gov.au/" + id
    response = s.get(url)
    found = re.search(
        r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text
    )
    work_data = found.group(1) if found else "{}"
    # Pause briefly after requests that weren't answered from the cache
    if not response.from_cache:
        time.sleep(0.2)
    return json.loads(work_data)


def get_iso_date(date):
    """
    Reduce a date string to its year.

    The date is parsed with the arrow pattern 'ddd, D MMM YYYY' and returned
    formatted as 'YYYY'. Falsy input (None or an empty string) returns None.
    """
    if not date:
        return None
    return arrow.get(date, "ddd, D MMM YYYY").format("YYYY")


def create_title_from_metadata(id):
    """
    Build a title record from the metadata embedded in a work's web page.

    `id` can be a full url or a bare identifier, which is appended to
    'https://nla.gov.au/'. The record has the same keys as the rows written by
    the API harvest; keys with no counterpart in the page metadata are left
    as empty lists or empty strings.
    """
    url = id if id.startswith("http") else "https://nla.gov.au/" + id
    metadata = get_metadata(url)
    # Values that can be read from the page metadata
    from_page = {
        "title": metadata.get("title", ""),
        "contributor": [metadata.get("creator", "")],
        "publisher": metadata.get("publisherName", ""),
        "date": [get_iso_date(metadata.get("issueDate", None))],
        "extent": metadata.get("extent", ""),
        "rights": metadata.get("copyrightPolicy", ""),
        "identifier": metadata.get("standardIds", []),
        "fulltext_url": url,
    }
    # Keys that are always left empty for records built this way
    blanks = {
        "type": [],
        "format": [],
        "language": [],
        "subject": [],
        "spatial": [],
        "is_part_of": [],
        "work_url": "",
        "work_type": "",
        "fulltext_url_text": "",
        "catalogue_url": "",
    }
    return {**from_page, **blanks}


def get_page_type(url):
    """
    Get the type of page at the given url.

    The page is fetched through the cached session and the type is read from
    the `data-screen-id` attribute of the first `meta` tag that has one.

    Raises:
        ValueError: if the page has no `meta` tag with a `data-screen-id`.
    """
    response = s.get(url)
    # Name the parser explicitly (the same one get_issues uses) so results
    # don't depend on which parsers happen to be installed.
    soup = BeautifulSoup(response.text, "lxml")
    screen = soup.find("meta", attrs={"data-screen-id": True})
    if screen is None:
        raise ValueError(f"No data-screen-id meta tag found at {url}")
    return screen["data-screen-id"]


def check_titles(
    input="ned-periodicals.ndjson", output="ned-periodicals-checked.ndjson"
):
    """
    Check that each harvested title really is a title, rather than an issue.

    Reads the harvested records from `input`, looks up the page type of each
    fulltext url, and writes the records to keep to `output`:
        * landing pages are kept as titles
        * 'Page Not Found' pages are reported and dropped
        * pages with a different top level collection are dropped; if that
          parent isn't in the dataset, a record for the parent is added
        * pages that are their own top level collection are kept only if
          they're ebook pages
    """
    df = pd.read_json(input, lines=True)
    df["id"] = df["fulltext_url"].apply(lambda x: x.strip("/").split("/")[-1])
    # Build the set of ids once, instead of scanning the dataframe for every record
    known_ids = set(df["id"])
    with Path(output).open("w") as ndjson_file:
        for title in tqdm(df.to_dict(orient="records"), total=df.shape[0]):
            url = title["fulltext_url"]
            page_type = get_page_type(url)
            # Keep title landing pages
            if page_type in ("Preview Landing Page", "Onsite Landing Page"):
                ndjson_file.write(f"{json.dumps(title)}\n")
                continue
            # Drop not found pages
            if page_type == "Page Not Found":
                print(url, "not found")
                continue
            metadata = get_metadata(url)
            parent_id = metadata["topLevelCollection"]
            pid = metadata["pid"]
            if parent_id != pid:
                # This page has a parent, so it's not a title.
                # If its parent isn't in the current dataset, add a record for it.
                if parent_id not in known_ids:
                    new_title = create_title_from_metadata(parent_id)
                    ndjson_file.write(f"{json.dumps(new_title)}\n")
            elif page_type == "Ebook Page":
                # Keep this in titles.
                # Need to do another check when getting issues.
                ndjson_file.write(f"{json.dumps(title)}\n")
            # Anything else (eg 'Picture Viewer Page') is ignored

# Run the check using the default input and output files
check_titles()

def merge_column(columns):
    """
    Merge the values in a group of cells into a single string.

    Each cell can be a list of values or a single value. Empty values (None,
    empty strings, NaN, etc) are dropped, the rest are converted to strings,
    de-duplicated, sorted, and joined with ' | '.
    """

    def is_empty(item):
        # NaN is truthy, so it has to be tested for explicitly; it's the only
        # float that doesn't equal itself.
        return not item or (isinstance(item, float) and item != item)

    values = set()
    for value in columns:
        items = value if isinstance(value, list) else [value]
        values.update(str(item) for item in items if not is_empty(item))
    return " | ".join(sorted(values))


def merge_records(df):
    """
    Collapse rows that share a `fulltext_url` into a single row.

    The result has one row per distinct `fulltext_url`. Every other column is
    grouped by `fulltext_url` and reduced to one string with `merge_column`.
    """
    key = "fulltext_url"

    # Columns that potentially have multiple values which will be merged
    multi_value_columns = [
        "title",
        "work_url",
        "work_type",
        "contributor",
        "publisher",
        "date",
        "type",
        "format",
        "extent",
        "language",
        "subject",
        "spatial",
        "is_part_of",
        "identifier",
        "rights",
        "fulltext_url_text",
        "catalogue_url",
    ]

    # Start from the distinct urls, then attach each merged column in turn,
    # linking on the `fulltext_url` value
    df_merged = df[[key]].drop_duplicates()
    grouped = df.groupby([key])
    for column in multi_value_columns:
        merged_column = grouped[column].apply(merge_column).reset_index()
        df_merged = pd.merge(df_merged, merged_column, on=[key], how="left")
    return df_merged

# Load the checked titles into a dataframe
df = pd.read_json("ned-periodicals-checked.ndjson", lines=True)

df.shape

# Presumably the notebook output recorded for `df.shape`
(9474, 19)

# Collapse records that share a `fulltext_url` into a single row
df_merged = merge_records(df)

# How many journals are there?
df_merged.shape[0]

# Presumably the notebook output recorded for `df_merged.shape[0]`
8572

def save_ned_titles(df, output="ned-periodicals.csv"):
    """
    Save the title records as a CSV file.

    An `id` column, taken from the last segment of each `fulltext_url`, is
    added to `df` in place. A fixed selection of columns is then written to
    `output` and returned as a dataframe.
    """

    def last_path_segment(url):
        return url.strip("/").split("/")[-1]

    df["id"] = df["fulltext_url"].apply(last_path_segment)
    title_columns = [
        "id",
        "title",
        "contributor",
        "publisher",
        "date",
        "fulltext_url",
        "work_url",
        "work_type",
        "type",
        "format",
        "extent",
        "language",
        "subject",
        "spatial",
        "is_part_of",
        "identifier",
        "rights",
        "catalogue_url",
    ]
    df_titles = df[title_columns]
    df_titles.to_csv(output, index=False)
    return df_titles

# Save the merged titles to the default CSV file, keeping the saved columns for later use
df_titles = save_ned_titles(df_merged)

def get_issues(parent_id):
    """
    Page through a record's browse listing and return the ids of its children.

    Pages of 20 results are requested until a page comes back with a different
    number of items, which marks the end of the listing.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # Number of results on a full page
    page_size = 20
    issue_ids = []
    offset = 0
    while True:
        # Get the browse page
        response = s.get(start_url.format(parent_id, offset))
        soup = BeautifulSoup(response.text, "lxml")
        # Each div with this class holds the details of one issue
        details = soup.find_all(class_="l-item-info")
        for detail in details:
            # The link is the parent of the h3 if there is one,
            # otherwise it's the first link in the div
            heading = detail.find("h3")
            anchor = heading.parent if heading else detail.find("a")
            issue_ids.append(anchor["href"].strip("/"))
        # Pause briefly after requests that weren't answered from the cache
        if not response.from_cache:
            time.sleep(0.2)
        offset += page_size
        # A page that isn't full means we've reached the end
        if len(details) != page_size:
            return issue_ids


def harvest_all_issues(input="ned-periodicals.csv", output="ned-issues.ndjson"):
    """
    Harvest details of the issues belonging to each title.

    Titles are read from the CSV file `input`. A title whose page is an
    'Ebook Page' is treated as its own single issue; for any other title the
    issue ids come from `get_issues`. The metadata of each issue is written to
    `output` as one JSON object per line. If an issue's metadata lacks `pid`
    or `title`, the title id is printed and the issue is skipped.
    """
    df = pd.read_csv(input)
    with Path(output).open("w") as ndjson_file:
        for title in tqdm(df.itertuples(), total=df.shape[0]):
            title_id = title.id
            if get_page_type(title.fulltext_url) == "Ebook Page":
                issues = [title.fulltext_url]
            else:
                issues = get_issues(title_id)
            for issue_id in issues:
                metadata = get_metadata(issue_id)
                try:
                    pid = metadata["pid"]
                    issue = {
                        "id": pid,
                        "title_id": title_id,
                        "title": metadata["title"],
                        "description": metadata.get("subUnitNo", ""),
                        "date": get_iso_date(metadata.get("issueDate", None)),
                        "url": f"https://nla.gov.au/{pid}",
                        "ebook_type": metadata.get("ebookType", ""),
                        "access_conditions": metadata.get("accessConditions", ""),
                        "copyright_policy": metadata.get("copyrightPolicy", ""),
                    }
                except KeyError:
                    print(title_id)
                    continue
                ndjson_file.write(f"{json.dumps(issue)}\n")

# Harvest the issues of every title using the default input and output files
harvest_all_issues()

# Load the harvested issues; dates aren't converted to datetimes and the
# `date` column is read as nullable integers.
df_issues = pd.read_json(
    "ned-issues.ndjson", convert_dates=False, dtype={"date": "Int64"}, lines=True
)

df_issues.to_csv("ned-periodical-issues.csv", index=False)

df_issues.shape

# Presumably the notebook output recorded for `df_issues.shape`
(179510, 9)

# Count the issues with unrestricted access for each title
df_totals = (
    df_issues.loc[df_issues["access_conditions"] == "Unrestricted"]
    .groupby(["title_id", "title"])
    .size()
    .to_frame()
    .reset_index()
)

# The twenty titles with the most unrestricted issues (the count is in column 0)
df_totals.sort_values(0, ascending=False)[:20]

# Number of issues for each access condition
df_issues["access_conditions"].value_counts()

# Output of `df_issues["access_conditions"].value_counts()`:
#
# access_conditions
# Unrestricted    155783
# View Only        15118
# Onsite Only       8609
# Name: count, dtype: int64

# Number of issues for each ebook type
df_issues["ebook_type"].value_counts()

# Output of `df_issues["ebook_type"].value_counts()`:
#
# ebook_type
# application/pdf         178553
#                            838
# application/epub+zip       119
# Name: count, dtype: int64

def add_download_link(row):
    """
    Build the ebook download url for an issue.
    Only issues whose access conditions are 'Unrestricted' get a link; every
    other issue gets an empty string.
    """
    if row["access_conditions"] != "Unrestricted":
        return ""
    return f"https://nla.gov.au/{row['id']}/download?downloadOption=eBook&firstPage=-1&lastPage=-1"


# Add a download link to every issue (empty unless access is unrestricted)
df_issues["download_link"] = df_issues.apply(add_download_link, axis=1)

# Create the SQLite database; `recreate=True` is passed so that, presumably,
# any existing file is replaced -- see the sqlite-utils docs.
db = Database("ned-periodicals.db", recreate=True)
# Add a `thumbnail` column as the first column of the titles table. The value
# is a JSON string with an `img_src` made from the fulltext url plus "-t".
df_titles.insert(
    0,
    "thumbnail",
    df_titles["fulltext_url"].apply(
        lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
    ),
)
db["titles"].insert_all(df_titles.to_dict(orient="records"), pk="id")
# Enable full text search on the main descriptive columns
db["titles"].enable_fts(["title", "contributor", "publisher", "subject"])


# Add a `thumbnail` column to the issues in the same way, using the issue url
df_issues.insert(
    0,
    "thumbnail",
    df_issues["url"].apply(
        lambda x: f'{{"img_src": "{x + "-t"}"}}' if not pd.isnull(x) else ""
    ),
)
# The `title` column is dropped from issues; `title_id` is kept and linked
# to the titles table as a foreign key below.
df_issues = df_issues.drop("title", axis=1)
db["issues"].insert_all(df_issues.to_dict(orient="records"), pk="id")
db["issues"].add_foreign_key("title_id", "titles", "id")

# IGNORE THIS CELL -- FOR TESTING ONLY
# Only runs when the GW_STATUS environment variable is set to "dev".
if os.getenv("GW_STATUS") == "dev":
    # Run the merge, save and issue harvest steps on the first 20 harvested records
    df_test = pd.read_json("ned-periodicals.ndjson", lines=True)[:20]
    df_merged_test = merge_records(df_test)
    df_titles_test = save_ned_titles(df_merged_test, "ned-periodicals-test.csv")
    harvest_all_issues(
        input="ned-periodicals-test.csv", output="ned-periodicals-issues-test.ndjson"
    )

    # Delete the files created by the test run
    Path("ned-periodicals-test.csv").unlink()
    Path("ned-periodicals-issues-test.ndjson").unlink()

# Notebook output: the twenty titles with the most unrestricted issues
# (`df_totals.sort_values(0, ascending=False)[:20]`).
#
# 	title_id	title	0
# 1737	nla.obj-1916881555	Western Australian government gazette.	2021
# 2598	nla.obj-2692666983	APSjobs-vacancies daily ... daily gazette.	1255
# 4424	nla.obj-2940864261	The Australian Jewish News.	1067
# 4448	nla.obj-2945379691	Tweed link	880
# 2201	nla.obj-2541626239	Weekly notice	798
# 34	nla.obj-1252109725	Queensland Health services bulletin	745
# 4423	nla.obj-2940863963	The Australian Jewish News.	726
# 16	nla.obj-1247944368	Hyden Karlgarin Householder News.	680
# 752	nla.obj-1775015332	E-record : your news from across the Archdioce...	679
# 7761	nla.obj-638303044	Class ruling	648
# 2191	nla.obj-2536144595	Plantagenet news.	594
# 3383	nla.obj-2815835489	The Apollo Bay news.	560
# 5642	nla.obj-3125539859	The Peninsula community access news.	528
# 3939	nla.obj-2859788676	Council news : weekly information from us to you	520
# 184	nla.obj-1252305285	Clermont rag : Community newspaper.	514
# 1710	nla.obj-1908935587	Assessment reports and exam papers	512
# 42	nla.obj-1252119874	Rot-Ayr-Ian [electronic resource] : the offici...	467
# 140	nla.obj-1252246096	Palm Island Voice.	454
# 4886	nla.obj-2994765231	Townsville Orchid Society Inc. bulletin.	452
# 4459	nla.obj-2949797877	Short list	431

# Notebook section headings:
#
# Harvest details of periodicals submitted to Trove through the National edeposit scheme (NED)
#
# Add your Trove API key
#
# Define some functions to do the work
#
# Harvest periodical titles
#
# Check that they're not really issues
#
# Remove duplicates
#
# Get details of issues
#
# Explore the data
#
# Create an SQLite database