import os
import re
from pathlib import Path

import arrow
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from omeka_s_tools.api import OmekaAPIClient
from pyzotero import zotero
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from trove_newspaper_images.articles import download_images

# Retry policy shared by every request made through the session below:
# up to 10 attempts, with backoff, whenever one of the listed gateway-type
# status codes comes back.
retries = Retry(total=10, backoff_factor=1, status_forcelist=[502, 503, 504, 524])

# One module-wide session, so the retry policy applies to both secure and plain requests
s = requests.Session()
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Load any settings stored in a .env file before the environment is consulted
load_dotenv()

# CONFIGURATION
# Things you need to change!
# Either paste your values in place of the placeholder strings below, or set the
# matching environment variables -- an environment variable wins whenever it is set.

# The url of your Omeka site's api (basically your Omeka site url with '/api' on the end)
API_URL = os.getenv("OMEKA_API_URL") or "http://your.omeka.site/api"

# The keys to your Omeka site
KEY_IDENTITY = os.getenv("OMEKA_KEY_IDENTITY") or "YOUR OMEKA KEY IDENTITY"
KEY_CREDENTIAL = os.getenv("OMEKA_KEY_CREDENTIAL") or "YOUR OMEKA KEY CREDENTIAL"

# Your Trove API key
TROVE_API_KEY = os.getenv("TROVE_API_KEY") or "YOUR TROVE API KEY"

# Header sent with every Trove API request
TROVE_HEADERS = {"X-API-KEY": TROVE_API_KEY}

# Maximum dimension for article images. The Trove page images are very big, so you
# might want to resize them before uploading to Omeka.
# Use None to keep them as big as possible (this might be useful if you're using
# the Omeka IIIF server & Universal viewer modules).
MAX_IMAGE_SIZE = 3000

# Client used for all calls to the Omeka-S API
omeka = OmekaAPIClient(
    API_URL,
    key_identity=KEY_IDENTITY,
    key_credential=KEY_CREDENTIAL,
)


def get_article(article_id):
    """
    Retrieve an individual newspaper article from the Trove API.

    Parameters:
    * `article_id` - a Trove article identifier

    Returns:
    * a dict with article metadata from Trove API

    Raises:
    * `requests.HTTPError` if Trove responds with an error status
    """
    # Use https (as the search requests do) so the API key header isn't sent in the clear
    url = f"https://api.trove.nla.gov.au/v3/newspaper/{article_id}"
    params = {"include": "articleText", "encoding": "json"}
    response = s.get(url, params=params, headers=TROVE_HEADERS)
    # Fail here with the HTTP status rather than later with a confusing KeyError
    response.raise_for_status()
    return response.json()


def check_for_item(item_url, template_id):
    """
    Look for an existing Omeka item whose schema:url matches the given url,
    restricted to items using the given resource template.

    Parameters:
    * `item_url` - a unique url saved in the schema:url field of an item
    * `template_id` - the Omeka id of a template to filter item results

    Returns:
    * the first matching item (JSON-LD representation), or None if nothing matches.
    """
    matches = omeka.filter_items_by_property(
        filter_property="schema:url",
        filter_value=item_url,
        resource_template_id=template_id,
    )
    # A missing "results" key and an empty result list both mean "not found"
    try:
        return matches["results"][0]
    except (KeyError, IndexError):
        return None


def add_newspaper(newspaper):
    """
    Make sure the given newspaper exists in Omeka, creating it if necessary.

    Parameters:
    * `newspaper` - this is the dict identifying the newspaper from a Trove article record

    Returns:
    * the Omeka id of the newspaper record
    """
    # The Newspaper template supplies both the template id and its resource class
    template = omeka.get_template_by_label("Newspaper")
    template_id = template["o:id"]
    class_id = template["o:resource_class"]["o:id"]
    # Trove persistent url for the newspaper -- used as the unique key in Omeka
    newspaper_url = f'http://nla.gov.au/nla.news-title{newspaper["id"]}'

    # Already in Omeka? Then just hand back its id.
    existing = check_for_item(newspaper_url, template_id)
    if existing:
        return existing["o:id"]

    # Otherwise build the metadata payload and upload a new item
    metadata = {
        "schema:name": [newspaper["title"]],
        "schema:identifier": [newspaper["id"]],
        "schema:url": [newspaper_url],
    }
    payload = omeka.prepare_item_payload_using_template(metadata, template_id)
    created = omeka.add_item(payload, template_id=template_id, class_id=class_id)
    return created["o:id"]


def add_article(article_id):
    """
    Check to see if the given article has already been uploaded to Omeka.
    If not, retrieve information about a newspaper article from Trove and
    upload the metadata, text, and images to Omeka.

    Parameters:
    * `article_id` - a Trove article identifier

    Returns:
    * a JSON-LD representation of the Omeka item (the existing one if the
      article was already uploaded, otherwise the newly created one)
    """
    # Get details of the newspaper article template
    article_template = omeka.get_template_by_label("Newspaper article")
    template_id = article_template["o:id"]
    # Get the resource class used with the newspaper article template
    class_id = article_template["o:resource_class"]["o:id"]
    # Construct a Trove article persistent url
    article_url = f"http://nla.gov.au/nla.news-article{article_id}"
    # If an article with this url has already been uploaded there's nothing more to do
    article_item = check_for_item(article_url, template_id)
    if article_item:
        return article_item

    # Get article details from Trove
    article = get_article(article_id)
    # Format a description
    formatted_date = arrow.get(article["date"], "YYYY-MM-DD").format("D MMM YYYY")
    summary = f'{formatted_date}, {article["title"]["title"]}, page {article["page"]}'
    # Get the Omeka id of the newspaper it was published in
    newspaper_id = add_newspaper(article["title"])
    # Remove html tags from article text (the text may be missing or empty).
    # Naming the parser avoids bs4's "no parser specified" warning and keeps the
    # result the same regardless of which optional parsers are installed.
    article_html = article.get("articleText")
    if article_html:
        article_text = BeautifulSoup(article_html, "html.parser").get_text()
    else:
        article_text = ""
    # Prepare the article metadata
    article_data = {
        "schema:name": [article["heading"]],
        "schema:description": [summary],
        "schema:datePublished": [article["date"]],
        "schema:isPartOf": [newspaper_id],
        "schema:pagination": [article["page"]],
        "schema:identifier": [article_id],
        "schema:url": [article_url],
        "schema:text": [article_text],
    }
    # Construct the payload for uploading to Omeka
    payload = omeka.prepare_item_payload_using_template(article_data, template_id)
    # Download images of the article, honouring the configured maximum dimension
    # (MAX_IMAGE_SIZE may be None to keep the images at full size)
    article_images = download_images(
        article_id, output_dir="temp", size=MAX_IMAGE_SIZE
    )
    image_paths = [Path("temp", image) for image in article_images]
    # Upload the article
    return omeka.add_item(
        payload, media_files=image_paths, template_id=template_id, class_id=class_id
    )

def get_total_results(params):
    """
    Get the total number of results for a Trove search.

    Parameters:
    * `params` - a dict of Trove API search parameters (left unmodified)

    Returns:
    * the total number of matching records as an int

    Raises:
    * `requests.HTTPError` if Trove responds with an error status
    """
    # Ask for zero records -- only the total is needed
    count_params = {**params, "n": 0}
    response = s.get(
        "https://api.trove.nla.gov.au/v3/result",
        params=count_params,
        headers=TROVE_HEADERS,
    )
    # Report the HTTP error rather than a KeyError from an error payload
    response.raise_for_status()
    data = response.json()
    return int(data["category"][0]["records"]["total"])


def upload_trove_search(params):
    """
    Upload every newspaper article returned by a Trove search to Omeka.

    Parameters:
    * `params` - a dict of Trove API search parameters (left unmodified)

    Raises:
    * `requests.HTTPError` if a page request gets an error status
    """
    # Work on a copy so the paging cursor isn't written into the caller's dict
    search_params = dict(params)
    start = "*"
    total = get_total_results(params)
    with tqdm(total=total) as pbar:
        while start:
            search_params["s"] = start
            response = s.get(
                "https://api.trove.nla.gov.au/v3/result",
                params=search_params,
                headers=TROVE_HEADERS,
            )
            response.raise_for_status()
            records = response.json()["category"][0]["records"]
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get("nextStart")
            # A page without an "article" list (e.g. no results) is just skipped
            for article in records.get("article", []):
                add_article(article["id"])
                pbar.update(1)

# Search parameters passed to upload_trove_search() below.
# Edit/add search values and parameters as required. These are an example only!
trove_params = {
    "q": '"inigo jones"',  # required -- change to anything you might enter in the Trove search box (including double quotes for phrases and boolean operators like AND)
    "category": "newspaper",  # don't change this
    "l-artType": "newspaper",
    "l-illustrated": "true",  # edit or remove -- limits to illustrated articles
    "l-illtype": "Photo",  # edit or remove -- limits to illustrations with photos
    "l-word": "1000+ Words",  # edit or remove -- limits to articles with more than 1000 words
    "include": "articleText",  # don't change this
    "encoding": "json",  # don't change this
}

# Runs the search and uploads every matching article as soon as this line executes
upload_trove_search(trove_params)

def upload_trove_list(list_id):
    """
    Upload any newspaper articles in the given Trove list to Omeka.

    Parameters:
    * `list_id` - the identifier of a Trove list

    Raises:
    * `requests.HTTPError` if Trove responds with an error status
    """
    # Use https and the X-API-KEY header (as the search requests do) so the
    # API key is neither sent in the clear nor exposed in the url
    url = f"https://api.trove.nla.gov.au/v3/list/{list_id}"
    params = {"include": "listItems", "encoding": "json"}
    response = s.get(url, params=params, headers=TROVE_HEADERS)
    response.raise_for_status()
    data = response.json()
    # Each list item is keyed by its category; only newspaper articles are uploaded
    for item in tqdm(data.get("listItem", [])):
        if "article" in item:
            add_article(item["article"]["id"])

# Paste the identifier of your list between the quotes
list_id = "[Your list ID]"
# NOTE: the placeholder above is immediately overridden by this example list id --
# edit or remove the next line once you've pasted in your own identifier.
list_id = "83777"
# Uploads the list's newspaper articles as soon as this line executes
upload_trove_list(list_id)

# ENTER YOUR VALUES BETWEEN THE QUOTES WHERE INDICATED
# Or you can store your information in a .env file -- the ZOTERO_KEY and ZOTERO_ID
# environment variables take priority over the placeholders whenever they are set.
ZOTERO_KEY = os.getenv("ZOTERO_KEY") or "YOUR ZOTERO KEY"  # The Zotero API key you generated
LIBRARY_TYPE = "user"  # user or group
LIBRARY_ID = os.getenv("ZOTERO_ID") or "YOUR ZOTERO ID"  # Either a personal user id or a group id

def upload_zotero_collection(coll_id):
    """
    Upload any Trove newspaper articles in the given collection to Omeka.

    Parameters:
    * `coll_id` - the id of a Zotero collection
    """
    zot = zotero.Zotero(LIBRARY_ID, LIBRARY_TYPE, ZOTERO_KEY)
    items = zot.everything(zot.collection_items(coll_id))
    articles = []
    for item in items:
        data = item.get("data", {})
        # Filter out things that aren't newspaper articles
        if data.get("itemType") != "newspaperArticle":
            continue
        # Take the digits that directly follow the Trove article prefix, so a
        # trailing slash or query string can't break (or corrupt) the id.
        # Urls that aren't Trove article links simply don't match.
        match = re.search(r"nla\.news-article(\d+)", data.get("url") or "")
        if match:
            articles.append(match.group(1))
    for article_id in tqdm(articles):
        add_article(article_id)

# Paste your collection ID between the quotes below.
collection_id = "YOUR COLLECTION ID"
# Uploads the Trove newspaper articles in that Zotero collection as soon as this line executes
upload_zotero_collection(collection_id)

# Edit the list of articles as you see fit...
# Each entry is a Trove article identifier
article_ids = [130413505, 65179201]

# Upload the articles one at a time, showing a progress bar
for article_id in tqdm(article_ids):
    add_article(article_id)

Upload Trove newspaper articles to Omeka-S

An example

Basic configuration

Generating your Omeka keys

Trove API key

Preparing Omeka-S

Installing the schema.org vocabulary

Installing the Numeric Data Type module

Importing the resource templates

Define all the functions that we need

Select your upload method

Option 1: Upload a Trove newspaper search

Option 2: Upload newspaper articles from a Trove list

Option 3: Upload Trove newspaper articles saved in Zotero

Option 4: Upload a list of article ids

Future developments