import json
import os
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm.auto import tqdm

load_dotenv()

class SubjectHarvester:

    def __init__(
        self,
        subject_output="pandora-subjects.ndjson",
        collection_output="pandora-collections.ndjson",
        sample=None,
    ):
        self.subject_output = subject_output
        self.collection_output = collection_output
        self.sample = sample

    def get_title_ids(self, page_id):
        """
        Get the TEP identifiers for all the titles on the specified page.
        Excludes titles in subcollections as they will can be harvested separately.
        """
        title_ids = []
        page = 1
        # Subjects can have multiple pages of titles, so we'll go through page by page
        # until there's no more titles
        while page:
            response = requests.get(f"http://pandora.nla.gov.au{page_id}/{page}")
            soup = BeautifulSoup(response.text, "lxml")
            # we only want the first itemlist containing titles
            # subsequent titles will be part of subcollections
            title_links = []
            for item_list in soup.find_all("div", class_="itemlist"):
                # This checks if the title list has an h1 tag before it
                # which indicates its actually a subcollection
                if not (
                    item_list.find_previous_sibling("h1")
                    and item_list.find_previous_sibling("h1").name == "h1"
                ):
                    # Extract the TEP ids from the links
                    title_links = item_list.find_all("a", href=re.compile(r"/tep/\d+"))
                    for title_link in title_links:
                        title_ids.append(title_link["href"])
            # Continue if it's a subject page and there were title links on this page
            if title_links and "/col/" not in page_id:
                page += 1
            else:
                page = None
            time.sleep(0.5)
        return title_ids

    def harvest_subcategories(self, subject_id):
        """
        Harvest details of sub-categories from a subject page.
        """
        subject_ids = []
        # Get the subject page
        response = requests.get(f"http://pandora.nla.gov.au{subject_id}")
        soup = BeautifulSoup(response.text, "lxml")
        # Get all the links to subcategories
        subject_links = soup.find_all("a", href=re.compile(r"/subject/\d+$"))
        # Process all the sub-categories
        for subject_link in subject_links:
            subject_name = " ".join(subject_link.stripped_strings)
            subject_id = subject_link["href"]
            # Get collections
            collection_ids = self.harvest_collections(subject_id)
            # Get titles
            title_ids = self.get_title_ids(subject_id)
            with Path(self.subject_output).open("a") as subjects_file:
                subjects_file.write(
                    json.dumps(
                        {
                            "name": subject_name,
                            "id": subject_id,
                            "collections": collection_ids,
                            "titles": title_ids,
                        }
                    )
                    + "\n"
                )
            subject_ids.append(subject_id)
        return subject_ids

    def harvest_subcollections(self, coll_id, coll_name):
        """
        Harvest sub-collections from a collection page.
        """
        collection_ids = []
        # Get the collection page
        response = requests.get(f"http://pandora.nla.gov.au{coll_id}")
        soup = BeautifulSoup(response.text, "lxml")
        # Sub-collections are included in the collection pages and identified with h1 headings.
        # The h1 headings include a name attribute that is set to the sub-collection id.
        # You can use the id to request a page that just has the subcollection.
        # First get all the h1 tags
        for subc in soup.find_all("h1"):
            # Get the id value from the name attribute
            sub_link = subc.find("a", {"name": re.compile(r"\d+")})
            if sub_link:
                sub_name = sub_link.string
                # Add the collection name to the sub collection name (if it's not already there)
                if coll_name not in sub_name:
                    sub_name = f"{coll_name} - {sub_name}"
                # Use the sub-collection id to get a list of titles in the sub-collection
                sub_id = f"/col/{sub_link['name']}"
                title_ids = self.get_title_ids(sub_id)
                with Path(self.collection_output).open("a") as collections_file:
                    collections_file.write(
                        json.dumps(
                            {
                                "name": sub_name,
                                "id": sub_id,
                                "titles": title_ids,
                                "subcollections": [],
                            }
                        )
                        + "\n"
                    )
                collection_ids.append(sub_id)
        return collection_ids

    def harvest_collections(self, subject_id):
        """
        Harvest details of collections from a subject, or sub-category page.
        """
        collection_ids = []
        # Get the subject page
        response = requests.get(f"http://pandora.nla.gov.au{subject_id}")
        soup = BeautifulSoup(response.text, "lxml")
        # Get all of the links to collection pages
        collection_links = soup.find_all("a", href=re.compile(r"/col/\d+$"))
        # Process each collection page
        for coll_link in collection_links:
            coll_name = " ".join(coll_link.stripped_strings)
            coll_id = coll_link["href"]
            # Get any sub-collections
            subcollection_ids = self.harvest_subcollections(coll_id, coll_name)
            # Get titles
            title_ids = self.get_title_ids(coll_id)
            with Path(self.collection_output).open("a") as collections_file:
                collections_file.write(
                    json.dumps(
                        {
                            "name": coll_name,
                            "id": coll_id,
                            "subcollections": subcollection_ids,
                            "titles": title_ids,
                        }
                    )
                    + "\n"
                )
            collection_ids.append(coll_id)
        return collection_ids

    def harvest(self):
        """
        Start the harvest by getting the top-level subjects on the Pandora home page
        and work down the hierarchy from there.
        """
        # Remove old data files
        Path(self.subject_output).unlink(missing_ok=True)
        Path(self.collection_output).unlink(missing_ok=True)
        # Get the Pandora home page
        response = requests.get("http://pandora.nla.gov.au/")
        soup = BeautifulSoup(response.text, "lxml")
        # Find the list of subjects
        subject_list = soup.find("div", class_="browseSubjects").find_all("li")
        # Process each top-level subject
        for subject in tqdm(subject_list[: self.sample]):
            subject_link = subject.find("a")
            subject_name = " ".join(subject_link.stripped_strings)
            subject_id = subject_link["href"]
            # Get subcategories
            subcategory_ids = self.harvest_subcategories(subject_id)
            # Get collections
            subcollection_ids = self.harvest_collections(subject_id)
            # Get titles
            title_ids = self.get_title_ids(subject_id)
            with Path(self.subject_output).open("a") as subjects_file:
                subjects_file.write(
                    json.dumps(
                        {
                            "name": subject_name,
                            "id": subject_id,
                            "subcategories": subcategory_ids,
                            "collections": subcollection_ids,
                            "titles": title_ids,
                        }
                    )
                    + "\n"
                )

harvester = SubjectHarvester()
harvester.harvest()

dfc = pd.read_json("pandora-collections.ndjson", lines=True)

dfc.shape

dfc.drop_duplicates(subset=["id"], inplace=True)

dfc.shape

dfc.to_json("pandora-collections.ndjson", orient="records", lines=True)

# IGNORE CELL --TESTING ONLY
if os.getenv("GW_STATUS") == "dev":

    harvester = SubjectHarvester(
        subject_output="pandora-subjects-test.ndjson",
        collection_output="pandora-collections-test.ndjson",
        sample=1,
    )
    harvester.harvest()

    Path("pandora-subjects-test.ndjson").unlink(missing_ok=True)
    Path("pandora-collections-test.ndjson").unlink(missing_ok=True)

Harvest Pandora subjects and collections¶

Pandora vs Trove¶

Subjects, Collections, and Titles¶

Datasets¶

Remove duplicate collections¶