Harvest parliament press releases from Trove¶
Trove includes more than 380,000 press releases, speeches, and interview transcripts issued by Australian federal politicians and saved by the Parliamentary Library. You can view them in the Trove web interface by searching for nuc:"APAR:PR" in the books & libraries category.
This notebook shows you how to harvest both metadata and full text from a search of the parliamentary press releases. The metadata is available from Trove, but to get the full text we have to go back to the Parliamentary Library's database, ParlInfo. The code in this notebook updates my original GitHub repository.
There are two main steps:
- Use the Trove API to search for specific keywords within the press releases and harvest metadata from the results. This gives us urls that we can use to get the text of the press releases from ParlInfo.
- Use the harvested urls to retrieve the press release from ParlInfo. The text of each release is extracted from the HTML page and saved as a plain text file.
Sometimes multiple press releases can be grouped together as 'works' in Trove. This is because Trove thinks that they're versions of the same thing. However, these versions are not always identical, and sometimes Trove has grouped press releases together incorrectly. To make sure that we harvest as many individual press releases as possible, the code below unpacks any versions contained within a 'work' and turns them into individual records.
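To make the unpacking step easier to follow, here's a simplified, made-up sketch of the nested work/version structure that the harvesting code below deals with (the field names follow the Trove API response, but the values are invented):
# A simplified, invented example of a Trove 'work' that groups several versions
work = {
    "id": "123456789",
    "version": [
        {
            # A version's 'id' can hold several space-separated identifiers,
            # one for each record grouped within that version
            "id": "111111111 222222222",
            "record": [
                {
                    "metadataSource": {"value": "APAR:PR"},
                    "metadata": {"dc": {"title": ["Example press release"]}},
                },
                {
                    "metadataSource": {"value": "ANL:DL"},
                    "metadata": {"dc": {"title": ["Example press release"]}},
                },
            ],
        }
    ],
}

# Unpack each grouped record into its own row, keeping only Parliamentary Library records
for version in work["version"]:
    ids = version["id"].split()
    for index, record in enumerate(version["record"]):
        source = record.get("metadataSource")
        if isinstance(source, dict) and source.get("value") == "APAR:PR":
            print(ids[index], record["metadata"]["dc"]["title"])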
It looks like the earlier documents have been OCRd and the results are quite variable. If you follow the fulltext_url link you should be able to view a PDF version for comparison.
It also seems that some documents only have a PDF version and not any OCRd text. These documents will be ignored by the save_texts() function, so you might end up with fewer texts than records.
The copyright statement attached to each record in Trove reads:
Copyright remains with the copyright holder. Contact the Australian Copyright Council for further information on your rights and responsibilities.
So depending on what you want to do with them, you might need to contact individual copyright holders for permission.
Duplicates and false positives¶
As noted above, Trove sometimes groups different press releases together as a single work. This seems to happen when press releases share a title and creator – for example, if an MP issues a press release titled 'Anzac Day' every year, these might be grouped as a single work. All the different versions will be harvested by default. However, because the search operates at the work level, it's entirely possible that some of the grouped versions won't actually contain the search term you're looking for. To exclude these, we need to examine the text of each version individually to see if it matches.
There will also be press releases that have exactly the same text content, both within and across works. For example, when a press release is issued both by a Minister and their department, or when MPs disseminate press releases issued by their party.
To make it easier to deal with these two issues, I've added some post-harvest processing steps to:
- remove records where the text content of the press release doesn't include any of the search terms (you'll need to adjust this to meet your needs)
- add a hash column that represents the text content of a press release – this can be used to identify duplicates
An example – politicians talking about 'immigrants' and 'refugees'¶
I've used this notebook to update an example dataset relating to refugees that I first generated in December 2017. It's been created by searching for the terms 'immigrant', 'asylum seeker', 'boat people', 'illegal arrivals', and 'boat arrivals' amongst the press releases. The exact query used is:
nuc:"APAR:PR" AND ("illegal arrival" OR text:"immigrant" OR text:"immigrants" OR "asylum seeker" OR "boat people" OR refugee OR "boat arrivals")
You can view the results of this query on Trove.
See Press releases relating to refugees for the full dataset.
Import the libraries we'll need¶
import hashlib
import json
import os
import re
import time
from pathlib import Path
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from slugify import slugify
from tqdm.auto import tqdm
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
Set your options¶
In the cell below you need to insert your search query and your Trove API key. You can get a Trove API key by following these instructions.
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
The search query can be anything you would enter in the Trove search box. As you can see from the examples below, it can include keywords, exact phrases, and boolean operators (AND, OR, and NOT).
You can change output_dir to save the results to a specific directory on your machine.
# Insert your query between the single quotes.
query = "ufo"
# Examples:
# query = '("illegal arrival" OR text:"immigrant" OR text:"immigrants" OR "asylum seeker" OR "boat people" OR refugee OR "boat arrivals")'
# query = "(COVID OR coronavirus)"
# You don't have to change this
output_dir = "press-releases"
Define some functions to do the work¶
def get_total_results(params):
"""
Get the total number of results for a search.
"""
these_params = params.copy()
these_params["n"] = 0
response = s.get("https://api.trove.nla.gov.au/v3/result", params=these_params)
data = response.json()
return int(data["category"][0]["records"]["total"])
def get_fulltext_url(links):
"""
    Loop through the identifier links to find a url for the full text version of the press release.
"""
url = None
for link in links:
if link["linktype"] == "fulltext":
url = link["value"]
break
return url
def get_source(version):
"""
Get the metadata source of a version.
"""
if "metadataSource" in version:
try:
source = version["metadataSource"]["value"]
except TypeError:
try:
source = version["metadataSource"]
except TypeError:
print(version)
except KeyError:
source = None
else:
source = None
return source
def get_value(record, field, keys=["value"]):
"""
Get the values of a field.
Some fields are lists of dicts, if so use the `key` to get the value.
"""
value = record.get(field, [])
if value and isinstance(value[0], dict):
for key in keys:
try:
return [re.sub(r"\s+", " ", v[key]) for v in value]
except KeyError:
pass
else:
return value
def merge_values(record, fields, keys=["value"]):
"""
Merges values from multiple fields, removing any duplicates.
"""
values = []
for field in fields:
values += get_value(record, field, keys)
# Remove duplicates and None value
return list(set([v for v in values if v is not None]))
def flatten_values(record, field, key="type"):
"""
If a field has a value and type, return the values as strings with this format: 'type: value'
"""
flattened = []
values = record.get(field, [])
for value in values:
if key in value:
flattened.append(f"{value[key]}: {value['value']}")
else:
flattened.append(value["value"])
return flattened
def harvest_prs(query):
"""
Harvest details of parliamentary press releases using the Trove API.
This function saves the 'version' level records individually (these are grouped under 'works').
"""
# Define parameters for the search -- you could change this of course
# The nuc:"APAR:PR" limits the results to the Parliamentary Press Releases
params = {
"q": f'nuc:"APAR:PR" AND ({query})',
"category": "all",
"n": 100,
"bulkHarvest": "true",
"encoding": "json",
"include": "workVersions",
"l-availability": "y",
}
start = "*"
total = get_total_results(params)
    url = "https://api.trove.nla.gov.au/v3/result"
with tqdm(total=total) as pbar:
with Path(f"press-releases-{slugify(query)}.ndjson").open("w") as ndjson_out:
while start:
params["s"] = start
response = s.get(url, params=params)
data = response.json()
                # If there's a nextStart value then we use it to request the next page of results
try:
start = data["category"][0]["records"]["nextStart"]
except KeyError:
start = None
items = data["category"][0]["records"]["item"]
for item in items:
for category, record in item.items():
if category == "work":
# Different records can be grouped within works as versions.
# So we're going to extract each version as a separate record.
for version in record["version"]:
# Sometimes there are even versions grouped together in a version... ¯\_(ツ)_/¯
# We need to extract their ids from a single string
ids = version["id"].split()
# Loop through versions in versions.
for index, sub_version in enumerate(version["record"]):
source = get_source(sub_version)
if source == "APAR:PR":
metadata = sub_version["metadata"]["dc"]
work = {
"version_id": ids[index],
"work_id": record["id"],
"work_type": record.get("type", []),
"title": get_value(metadata, "title"),
"contributor": merge_values(
metadata,
["creator", "contributor"],
["value", "name"],
),
"date": merge_values(
metadata, ["date", "issued"]
),
"description": get_value(
metadata, "description"
),
# Using merge here because I've noticed some duplicate values
"type": merge_values(metadata, ["type"]),
"format": get_value(metadata, "format"),
"language": get_value(metadata, "language"),
"extent": get_value(metadata, "extent"),
"rights": merge_values(
metadata, ["rights", "licenseRef"]
),
"subject": merge_values(
metadata, ["subject"]
),
# Flattened type/value
"is_part_of": flatten_values(
metadata, "isPartOf"
),
"fulltext_url": get_fulltext_url(
metadata["identifier"]
),
}
ndjson_out.write(
f"{json.dumps(work, ensure_ascii=False)}\n"
)
pbar.update(100)
def save_texts(query, output_dir="press-releases"):
"""
Get the text of press releases in the ParlInfo db.
This function uses urls harvested from Trove to request press releases from Parlinfo.
Text is extracted from the HTML files and saved as individual text files.
"""
input = Path(f"press-releases-{slugify(query)}.ndjson")
output_path = Path(output_dir, f"press-releases-{slugify(query)}", "text")
output_path.mkdir(parents=True, exist_ok=True)
# ParlInfo requires a user-agent
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0",
}
total = sum(1 for _ in open(input))
with input.open("r") as ndjson_in:
for line in tqdm(ndjson_in, total=total):
record = json.loads(line)
contributor = slugify(
record["contributor"][0] if record["contributor"] else "unknown"
)
filename = f"{record['date'][0]}-{contributor}-{record['version_id']}.txt"
file_path = Path(output_path, filename)
# Only save files we haven't saved before
if not file_path.exists():
# Get the Parlinfo web page
response = requests.get(record["fulltext_url"], headers=headers)
# Parse web page in Beautiful Soup
soup = BeautifulSoup(response.text, "lxml")
content = soup.find("div", class_="box")
# If we find some text on the web page then save it.
if content:
# Open file
# print 'Saving file...'
with open(file_path, "w", encoding="utf-8") as text_file:
# Get the contents of each paragraph and write it to the file
for para in content.find_all("p"):
text_file.write("{}\n\n".format(para.get_text().strip()))
else:
# No content could be an error at APH
print(response.url)
time.sleep(10)
Harvest the metadata!¶
Running the cell below will harvest details of all the press releases matching our query using the Trove API. The results are written to a newline-delimited JSON (ndjson) file for further use.
harvest_prs(query)
Download the text files¶
The details we've harvested from the Trove API include a url that points to the full text of the press release in the ParlInfo database. Now we can loop through all those urls, saving the text of the press releases. Sometimes files don't download on first attempt. It's worth running this cell multiple times to see if some additional texts are downloaded.
save_texts(query, output_dir)
Convert to a dataframe¶
To make it easier to manipulate the harvested metadata, we'll convert the ndjson file to a dataframe.
df = pd.read_json(f"press-releases-{slugify(query)}.ndjson", lines=True)
df.head()
  | version_id | work_id | work_type | title | contributor | date | description | type | format | language | extent | rights | subject | is_part_of | fulltext_url
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 211350983 | 193080997 | [Article/Other article, Article] | [Transcript of joint press conference with New... | [Rudd, Kevin, Mccully, Murray] | [2011-03-26] | [A press release issued by a member of the Aus... | [Press Release] | [] | [eng] | [5p.] | [Copyright remains with the copyright holder. ... | [visit to New Zealand, Syria, New Zealand dome... | [Press releases database, Australian Parliamen... | http://parlinfo.aph.gov.au/parlInfo/search/dis... |
1 | 211352306 | 193082252 | [Sound/Other sound, Sound] | [Transcript of interview with the Hot Breakfas... | [the Hot Breakfast Team, Abbott, Tony] | [2011-05-06] | [A press release captured for archiving in the... | [Press Release, Broadcast transcript] | [] | [eng] | [7p.] | [Copyright remains with the copyright holder. ... | [Julia Gillard's carbon tax] | [Triple M, Adelaide, Press releases database, ... | http://parlinfo.aph.gov.au/parlInfo/search/dis... |
2 | 211479325 | 193193664 | [Article/Other article, Article] | [St. Clair 'the truth is out there'] | [St Clair, Stuart, National Party of Australia] | [1999-12-09] | [A press release captured for archiving in the... | [Press Release] | [Online Text] | [eng] | [] | [Copyright remains with the copyright holder. ... | [] | [Press releases database, Australian Parliamen... | http://parlinfo.aph.gov.au/parlInfo/search/dis... |
3 | 213729661 | 195167931 | [Article/Other article, Article] | [Address to Federal Council - Perth] | [Fraser, Malcolm] | [1979-04-22] | [A press release issued by a member of the Aus... | [Press Release, Speech] | [] | [eng] | [] | [Copyright remains with the copyright holder. ... | [] | [Press releases database, Australian Parliamen... | http://parlinfo.aph.gov.au/parlInfo/search/dis... |
4 | 213769918 | 195229143 | [Article/Other article, Article] | [Information Technology- a Critical View] | [Jones, Barry] | [1981-08-22] | [A press release issued by a member of the Aus... | [Press Release, Speech] | [] | [eng] | [] | [Copyright remains with the copyright holder. ... | [] | [Press releases database, Australian Parliamen... | http://parlinfo.aph.gov.au/parlInfo/search/dis... |
Note that the number of records in the harvested data might be different to the number of search results. This is because we've unpacked versions that had been combined into a single work.
# How many records
df.shape[0]
12
Sometimes it's not possible to download the text from a press release. Let's see how many text files were actually downloaded.
print(
sum(
1
for _ in Path(output_dir, f"press-releases-{slugify(query)}", "text").glob(
"*.txt"
)
)
)
14
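If you want to check which records don't have a matching text file (for example, PDF-only documents or failed downloads), a quick sketch like the one below will list them. It assumes the file naming pattern used by save_texts() above.
# List records whose version_id doesn't appear in any saved text file name
text_dir = Path(output_dir, f"press-releases-{slugify(query)}", "text")
saved_ids = {re.search(r"-(\d+)\.txt", f.name).group(1) for f in text_dir.glob("*.txt")}
missing = df.loc[~df["version_id"].astype(str).isin(saved_ids)]
missing[["version_id", "title", "fulltext_url"]]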
Removing non-matches¶
The cell below will delete records, so don't run it unless you understand what it's doing!
As noted above, some of the press releases might not actually match our search. The cell below uses regular expressions to run a very basic check of the harvested text files to see if they contain the desired search terms. You will need to adjust pattern to suit your desired search results. In particular, you'll need to consider the amount of fuzziness you might expect in your search results and whether that will be captured by the regular expression pattern. If this is a problem, it might be better to use something like fuzzysearch to do the comparisons (there's a short sketch below).
If the desired search terms are not found in a text file, the corresponding Trove record is removed from the results dataframe, and the text file is deleted.
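For example, here's a minimal sketch of approximate matching with the fuzzysearch package (an assumption: it isn't installed by this notebook, so you'd need to add it yourself):
from fuzzysearch import find_near_matches

# Allow up to two character edits when matching a term against OCRd text
sample_text = "new support for refogees arriving by boat"  # 'refugees' mangled by OCR
find_near_matches("refugees", sample_text, max_l_dist=2)  # returns a list of Match objects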
# Change this!
# pattern = r"(covid|coronavirus)"
pattern = rf"\b{query}\b"
for text_file in Path(output_dir, f"press-releases-{slugify(query)}", "text").glob(
"*.txt"
):
# Are our search terms in the file?
if re.findall(pattern, text_file.read_text().lower()) == []:
# Get the version id
version_id = re.search(r"\-(\d+)\.txt", text_file.name).group(1)
# Remove the record with that version_id from the dataset
df = df.loc[df["version_id"] != int(version_id)]
# Delete the text file
text_file.unlink()
How many records do we have now?
df.shape[0]
8
print(
sum(
1
for _ in Path(output_dir, f"press-releases-{slugify(query)}", "text").glob(
"*.txt"
)
)
)
10
Find press releases with duplicate content¶
As noted above, there might be press releases that have the same content, but different metadata (eg title or creator). To make these duplicates easy to identify, this cell adds a hash column to the dataset. The hash value is a short string representation of each record's associated text file. If two records have the same hash value, then the contents of the press releases will be the same.
If you want, you can use this column to drop duplicates from the dataset. On the other hand, if you're interested in seeing how press releases are disseminated, you might want to group records by their hash values and compare the metadata within each group (there's a short example below).
def get_hash(version_id):
try:
text_file = next(
Path(output_dir, f"press-releases-{slugify(query)}", "text").glob(
f"*-{version_id}.txt"
)
)
hashed = hashlib.sha1(text_file.read_text().encode()).hexdigest()
except StopIteration:
print(version_id)
hashed = None
return hashed
df["hash"] = df["version_id"].apply(get_hash)
How many unique press releases are there?
df["hash"].nunique()
8
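For example, here are two ways you might use the hash column (a sketch only; the new variables aren't used in the rest of the notebook, so the dataset saved below is unchanged):
# Keep just one record for each unique text content
df_unique = df.drop_duplicates(subset="hash")

# Or view groups of records that share identical text, so you can compare their metadata
duplicate_groups = df[df.duplicated(subset="hash", keep=False)].sort_values("hash")
duplicate_groups[["hash", "title", "contributor", "date"]]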
Save the dataset¶
Let's save the dataset as a CSV file for download.
Many of the columns in the dataset contain multiple values in a list. Before we convert the dataset to CSV, we'll convert these lists to strings, using the | character to separate values.
def merge_lists(column):
try:
return column.apply(lambda x: "|".join(x) if isinstance(x, list) else x)
except AttributeError:
return column
df = df.apply(merge_lists)
# Add a Trove link to each work/version
df["trove_url"] = df.apply(
lambda x: f"https://trove.nla.gov.au/work/{x['work_id']}/version/{x['version_id']}",
axis=1,
)
Now we can save the metadata as a CSV-formatted dataset. The dataset and the downloaded text files will be in the press-releases directory, inside a sub-directory named according to your query.
# Save the data as a CSV file
output_path = Path(output_dir, f"press-releases-{slugify(query)}")
output_path.mkdir(parents=True, exist_ok=True)
df[
[
"title",
"contributor",
"date",
"description",
"type",
"format",
"work_type",
"language",
"extent",
"rights",
"subject",
"is_part_of",
"fulltext_url",
"trove_url",
"work_id",
"version_id",
"hash",
]
].to_csv(Path(output_path, "results.csv"), index=False)
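If you load results.csv again later, the pipe-separated columns can be split back into lists. A minimal sketch (assuming the file saved above):
# Read the saved CSV back in and split a multi-value column on the '|' separator
results = pd.read_csv(Path(output_path, "results.csv"))
results["contributor"] = results["contributor"].str.split("|")
results.head()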
Created by Tim Sherratt for the GLAM Workbench.