I’ve been working on a client’s new site, which we’re moving from SquareSpace to self-hosted WordPress. Along the way, we imported all of their blog posts and discovered that the images in each post are still hosted on the SquareSpace CDN.

So that we can host those images on our own platform, I’ve prompted ChatGPT to help me come up with this Python script to download all the CDN images found in the XML export file we got from SquareSpace:

import logging
import os
import shutil
import xml.etree.ElementTree as ET
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

# Send INFO-and-above records to the root logger with a timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Prefix -> URI map so ElementTree can resolve the "content:" prefix in lookups.
namespaces = {'content': 'http://purl.org/rss/1.0/modules/content/'}

# Load the export file and keep a handle on its root element.
xml_file = 'example.xml'
tree = ET.parse(xml_file)
root = tree.getroot()

# Collect the unique image URLs referenced by <img> tags in every post body.
#
# The extension test is a case-insensitive substring match against the whole
# URL (so a trailing query string does not defeat it). ".jpeg" and ".webp" are
# listed explicitly: ".jpg" is not a substring of ".jpeg", so without it such
# images would be silently skipped, as would upper-case names like "photo.JPG".
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp')

images = set()
for item in root.findall('.//item'):
    content = item.find('content:encoded', namespaces)
    if content is None or content.text is None:
        # Either the element is missing or it has no text to parse.
        logging.warning('No content found in an item or content is empty.')
        continue

    soup = BeautifulSoup(content.text, 'html.parser')
    for img_tag in soup.find_all('img'):
        img_url = img_tag.get('src')
        if not img_url:
            continue  # <img> without a usable src attribute
        lowered_url = img_url.lower()
        if any(ext in lowered_url for ext in IMAGE_EXTENSIONS):
            # Store the URL exactly as written; only the comparison is lowered.
            images.add(img_url)

if not images:
    logging.warning('No images found to download.')
else:
    logging.info(f'Found {len(images)} images to download.')

# Download every collected image into a local directory.
output_dir = 'images'
# open() does not create missing directories; without this every download
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Case-folded filenames already claimed during this run, so two URLs that share
# a basename do not silently overwrite each other (also on case-insensitive
# filesystems).
used_names = set()

# Iterate in sorted order so collision suffixes are assigned identically on
# every run; plain set iteration order is not stable between runs.
for img in sorted(images):
    # Derive the name from the URL path only, so query strings such as
    # "?format=1500w" do not end up in the filename on disk.
    filename = os.path.basename(urlsplit(img).path)
    if not filename:
        logging.error(f'Failed to download {img}. Could not derive a filename from the URL.')
        continue

    # Disambiguate repeated basenames as "name-1.ext", "name-2.ext", ...
    stem, ext = os.path.splitext(filename)
    suffix = 1
    while filename.casefold() in used_names:
        filename = f'{stem}-{suffix}{ext}'
        suffix += 1
    used_names.add(filename.casefold())

    try:
        logging.info(f'Downloading image from {img}')
        # The timeout keeps one unresponsive host from hanging the whole run;
        # the with-block closes the connection on every exit path.
        with requests.get(img, stream=True, timeout=30) as resp:
            if resp.status_code == 200:
                with open(os.path.join(output_dir, filename), 'wb') as local_file:
                    # Have urllib3 undo gzip/deflate transfer encoding so the
                    # bytes written are the actual image data.
                    resp.raw.decode_content = True
                    shutil.copyfileobj(resp.raw, local_file)
                logging.info(f'Successfully downloaded {img}')
            else:
                logging.error(f'Failed to download {img}. Status code: {resp.status_code}')
    except Exception as e:
        # Deliberately broad: this is a best-effort batch job, and reading from
        # resp.raw can surface urllib3 errors that requests does not wrap.
        logging.error(f'Error downloading {img}. Error: {e}')