Get covers (or any other pages) from a digitised journal in Trove¶
In another notebook, I showed how to get issue metadata and OCRd texts from a digitised journal in Trove. It's also possible to download page images and PDFs. This notebook shows how to download all the cover images from a specified journal. With some minor modifications you could download any page, or range of pages.
Import what we need¶
# Let's import the libraries we need.
import io
import os
import re
import shutil
import time
import zipfile
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
What journal do you want?¶
In the cell below, replace the nla.obj-... value with the identifier of the journal you want to harvest. You'll find the identifier in the url of the journal's landing page. An easy way to find it is to go to the Trove Titles app and click on the 'Browse issues' button for the journal you're interested in. For example, if I click on the 'Browse issues' button for the Angry Penguins broadsheet it opens http://nla.gov.au/nla.obj-320790312, so the journal identifier is nla.obj-320790312.
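If you've copied the whole landing page url rather than just the identifier, a simple regular expression will pull the nla.obj-... part out for you. This is just a convenience sketch, not part of the workflow below; the extract_journal_id helper is hypothetical and the example url is the Angry Penguins one from above.
import re

def extract_journal_id(url):
    # Hypothetical helper: find an nla.obj-... identifier anywhere in a url
    match = re.search(r"nla\.obj-\d+", url)
    return match.group(0) if match else None

print(extract_journal_id("http://nla.gov.au/nla.obj-320790312"))  # nla.obj-320790312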
# Replace the value in the single quotes with the identifier of your chosen journal
journal_id = "nla.obj-320790312"
# Where do you want to save the results?
output_dir = "images"
# Set up the data directory
image_dir = os.path.join(output_dir, journal_id)
os.makedirs(image_dir, exist_ok=True)
Define some functions to do the work¶
def harvest_metadata(obj_id):
    """
    This calls an internal API from a journal landing page to extract a list of available issues.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    issues = []
    with tqdm(desc="Issues", leave=False) as pbar:
        # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
        while n == 20:
            # Get the browse page
            response = s.get(start_url.format(obj_id, start), timeout=60)
            # Beautifulsoup turns the HTML into an easily navigable structure
            soup = BeautifulSoup(response.text, "lxml")
            # Find all the divs containing issue details and loop through them
            details = soup.find_all(class_="l-item-info")
            for detail in details:
                issue = {}
                title = detail.find("h3")
                if title:
                    issue["title"] = title.text
                    issue["id"] = title.parent["href"].strip("/")
                else:
                    issue["title"] = "No title"
                    issue["id"] = detail.find("a")["href"].strip("/")
                try:
                    # Get the issue details
                    issue["details"] = detail.find(
                        class_="obj-reference content"
                    ).string.strip()
                except (AttributeError, IndexError):
                    issue["details"] = "issue"
                # Get the number of pages
                try:
                    issue["pages"] = int(
                        re.search(
                            r"^(\d+)",
                            detail.find("a", attrs={"data-pid": issue["id"]}).text,
                            flags=re.MULTILINE,
                        ).group(1)
                    )
                except AttributeError:
                    issue["pages"] = 0
                issues.append(issue)
                # print(issue)
            if not response.from_cache:
                time.sleep(0.5)
            # Increment the startIdx
            start += n
            # Set n to the number of results on the current page
            n = len(details)
            pbar.update(n)
    return issues
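If you want to see what the browse endpoint actually returns before harvesting everything, the sketch below fetches just the first page of results and prints the issue headings. It reuses the cached session, the url pattern from harvest_metadata, and the journal_id you set above; it's an optional aside, not a required step.
# Peek at the first page of browse results (20 issues at most)
response = s.get(
    "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c".format(journal_id, 0),
    timeout=60,
)
soup = BeautifulSoup(response.text, "lxml")
# Each issue sits in a div with the 'l-item-info' class
for detail in soup.find_all(class_="l-item-info"):
    heading = detail.find("h3")
    print(heading.text if heading else "No title")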
def save_page(issues, output_dir, page_num=1):
    """
    Downloads the specified page from a list of journal issues, saving the images to `output_dir`.
    If you want to download a range of pages, change the `lastPage` value in the download url to your end point.
    But beware the images are pretty large.
    """
    # Loop through the issue metadata
    for issue in tqdm(issues):
        # print(issue['id'])
        id = issue["id"]
        # Check to see if the page of this issue has already been downloaded
        if not os.path.exists(
            os.path.join(output_dir, "{}-{}.jpg".format(id, page_num))
        ):
            # Change lastPage to download a range of pages
            url = "https://nla.gov.au/{0}/download?downloadOption=zip&firstPage={1}&lastPage={1}".format(
                id, page_num - 1
            )
            # Get the file
            r = s.get(url, timeout=60)
            # print(r.url, r.status_code)
            # The image is in a zip, so we need to extract the contents into the output directory
            z = zipfile.ZipFile(io.BytesIO(r.content))
            z.extractall(output_dir)
            time.sleep(0.5)
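As the docstring notes, you can also grab a run of pages from each issue. Here's one possible way to do that. It's only a sketch: the save_page_range function below isn't defined anywhere else in this notebook, and it simply varies the firstPage and lastPage values in the same zip download url that save_page uses (which appear to count pages from zero, hence the subtraction).
def save_page_range(issues, output_dir, first_page=1, last_page=2):
    """
    Sketch of a save_page variant that downloads a range of pages from each issue.
    Page numbers are 1-based here. Beware: multi-page zips can be very large.
    """
    for issue in tqdm(issues):
        id = issue["id"]
        url = "https://nla.gov.au/{0}/download?downloadOption=zip&firstPage={1}&lastPage={2}".format(
            id, first_page - 1, last_page - 1
        )
        r = s.get(url, timeout=60)
        # Extract the zipped images into the output directory
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(output_dir)
        time.sleep(0.5)

# For example, to get the first three pages of each issue:
# save_page_range(issues, image_dir, first_page=1, last_page=3)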
Get a list of issues¶
Run the cell below to extract a list of issues for your selected journal and save them to the issues variable.
issues = harvest_metadata(journal_id)
Convert the list of issues to a Pandas dataframe and have a look inside.
df = pd.DataFrame(issues)
df.head()
|  | title | id | details | pages |
| --- | --- | --- | --- | --- |
| 0 | Angry Penguins broadsheet. | nla.obj-320791009 | Collection No. 1 | 16 |
| 1 | Angry Penguins broadsheet. | nla.obj-320791023 | Collection No. 2 | 16 |
| 2 | Angry Penguins broadsheet. | nla.obj-320791046 | Collection No. 3 | 16 |
| 3 | Angry Penguins broadsheet. | nla.obj-320791067 | Collection No. 4 | 16 |
| 4 | Angry Penguins broadsheet. | nla.obj-320791128 | Collection No. 5 (May, 1946) | 16 |
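As a quick sanity check, you might also want to know how many issues were found and how many pages they contain in total. This is an optional aside using the columns shown above.
print(f"{len(df)} issues, {df['pages'].sum()} pages in total")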
Save the data to a CSV file.
df.to_csv("{}/issues.csv".format(image_dir), index=False)
Get the images¶
Run the cell below to work through the list of issues, downloading the first page of each, and saving it to the specified directory. Note that the images can be quite large!
save_page(issues, image_dir, 1)
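If you'd like to check the results without leaving the notebook, the sketch below displays one of the downloaded covers. It assumes the extracted files have a .jpg extension (as the existence check in save_page does); adjust the pattern if your journal's images differ.
import glob
from IPython.display import Image

# Find the downloaded cover images and display the first one
covers = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
if covers:
    display(Image(filename=covers[0], width=300))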
Download the results¶
If you're running this notebook using a cloud service (like Binder), you'll want to download your results. The cell below zips up the journal directory and creates a link for easy download.
shutil.make_archive(image_dir, "zip", image_dir)
display(HTML("<b>Download results</b>"))
display(
HTML(f'<a download="{journal_id}.zip" href="{image_dir}.zip">{image_dir}.zip</a>')
)
Created by Tim Sherratt for the GLAM Workbench.
Work on this notebook was supported by the Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab.