Substack Downloader

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters

Show hidden characters

	anamihalceamdphd
	karenkingston
	gregreese
	palexander
	tlavagabond
	petermcculloughmd
	corbettreport
	merylnass
	drtenpenny

view raw list.txt hosted with ❤ by GitHub

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters

Show hidden characters

	beautifulsoup4==4.12.2
	certifi==2023.7.22
	charset-normalizer==3.2.0
	gazpacho==1.1
	idna==3.4
	markdownify==0.11.6
	Pillow==10.0.0
	requests==2.31.0
	six==1.16.0
	soupsieve==2.4.1
	urllib3==2.0.4

view raw requirements.txt hosted with ❤ by GitHub

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters

Show hidden characters

	#!/usr/bin/env bash

	# Route every catchable fatal signal to kill_it, passing the signal name so
	# the handler can report which one arrived.
	# SIGKILL (9) is intentionally not listed: it cannot be caught, blocked or
	# ignored, so a trap on it would never run.
	trap 'kill_it TERM' TERM # 15 - Termination signal
	trap 'kill_it PIPE' PIPE # 13 - Broken pipe: write to pipe with no readers
	trap 'kill_it SEGV' SEGV # 11 - Invalid memory reference
	trap 'kill_it FPE' FPE # 8 - Floating point exception
	trap 'kill_it ABRT' ABRT # 6 - Abort signal from abort(3)
	trap 'kill_it ILL' ILL # 4 - Illegal Instruction
	trap 'kill_it QUIT' QUIT # 3 - Quit from keyboard
	trap 'kill_it INT' INT # 2 - Interrupt from keyboard
	trap 'kill_it HUP' HUP # 1 - Hangup detected on controlling terminal or death of controlling process

	# Signal handler: report what was received, remove the in-progress entry
	# (if any) from running.txt, leave the virtualenv and exit with status 1.
	function kill_it() {
	    echo "Killed $*"
	    if [ -n "$RUNNING" ] && [ -f running.txt ]; then
	        # Reading and redirecting to the same file truncates it before grep
	        # opens it, which would drop every entry. Filter into a temporary
	        # file and move it into place instead.
	        # -F -x: remove only the line that is exactly $RUNNING, not lines
	        # that merely contain it or match it as a regex.
	        grep -vFx -- "$RUNNING" running.txt > "running.txt.$$"
	        mv -f "running.txt.$$" running.txt
	    fi
	    # deactivate is only defined once .venv/bin/activate has been sourced.
	    if declare -F deactivate > /dev/null; then
	        deactivate
	    fi
	    exit 1
	}

	source .venv/bin/activate
	RUNNING=""
	# running.txt lists the newsletters currently being downloaded; names found
	# there are skipped. Create it up front so the grep below never fails on a
	# missing file.
	touch running.txt
	# Visit the newsletters from list.txt in random order. Word splitting of the
	# command substitution also strips any surrounding whitespace on each line.
	for l in $(shuf list.txt); do
	    echo ">>> $l"
	    # -F -x: exact whole-line match, so one name never masks another that
	    # merely contains it.
	    if ! grep -qFx -- "$l" running.txt; then
	        echo "$l" >> running.txt
	        RUNNING="$l"
	        ./ssjl.py "https://$l.substack.com" "$l" --archive "$l.txt"
	        # Filter through a temporary file: redirecting grep's output onto
	        # its own input would truncate running.txt before it is read.
	        grep -vFx -- "$l" running.txt > "running.txt.$$"
	        mv -f "running.txt.$$" running.txt
	        RUNNING=""
	    fi
	done

view raw run.sh hosted with ❤ by GitHub

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters

Show hidden characters

	#!/usr/bin/env python3

	# python3 -m venv .venv
	# . .venv/bin/activate
	# pip3 install requests markdownify pillow gazpacho
	# ./ssjl.py https://karenkingston.substack.com ~/Downloads/karenkingston

	import argparse
	import asyncio
	import json
	import os
	import sys
	from pathlib import Path
	from random import randrange
	from time import sleep, perf_counter
	from urllib.parse import urlsplit

	import markdownify
	import requests
	from gazpacho import Soup
	from PIL import Image
	from requests.exceptions import HTTPError

	def create_dir(directory):
	    """Ensure ``directory`` exists as a directory, creating parents as needed.

	    Raises:
	        ValueError: if the path already exists but is not a directory.
	    """
	    if os.path.isdir(directory):
	        return
	    if os.path.exists(directory):
	        raise ValueError('Path exists: %s' % directory)
	    os.makedirs(directory)

	def fetch_json(url, params, timeout=30):
	    """Fetch and decode JSON from an API endpoint.

	    Args:
	        url: a URL already containing ``/api/v1`` (used as-is), or a base
	            URL to which ``/api/v1/archive`` is appended.
	        params: query-string parameters handed to ``requests.get``.
	        timeout: seconds to wait on the server. Without a timeout a stalled
	            connection would block the whole run indefinitely.

	    Returns:
	        The decoded JSON body, or ``None`` if the request or the decoding
	        failed (the error is printed, not raised).
	    """
	    endpoint = url if '/api/v1' in url else '%s/api/v1/archive' % url
	    try:
	        response = requests.get(endpoint, params=params, timeout=timeout)
	        response.raise_for_status()
	        return response.json()
	    except HTTPError as err:
	        print(f'HTTP error occurred: {err}')
	    except Exception as err:
	        # Best-effort boundary: connection errors, timeouts and invalid JSON
	        # are reported and turned into a None result.
	        print(f'Other error occurred: {err}')
	    return None

	def fetch_html(url, timeout=30):
	    """Download ``url`` and return the response body as text.

	    Args:
	        url: address of the page to fetch.
	        timeout: seconds to wait on the server before giving up.

	    Returns:
	        The page text, or ``None`` if the request failed or the server
	        answered with an error status (the error is printed, not raised).
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        # Without this check the HTTPError handler below can never trigger
	        # and an error page would be returned as if it were the article.
	        response.raise_for_status()
	        return response.text
	    except HTTPError as err:
	        print(f'HTTP error occurred: {err}')
	    except Exception as err:
	        print(f'Other error occurred: {err}')
	    return None

	def fetch_and_parse(url, archive=None):
	    """Yield one dict per post that is not yet recorded in ``archive``.

	    Pages through the JSON listing returned by ``fetch_json`` twelve entries
	    at a time, downloads each post page and converts the contents of its
	    ``div.markup`` element to Markdown.

	    Args:
	        url: base URL passed through to ``fetch_json``.
	        archive: optional path of a text file holding one already-downloaded
	            post name per line; created empty if it does not exist.

	    Yields:
	        Dicts with the keys ``title``, ``subtitle``, ``type``, ``link``,
	        ``thumb``, ``md``, ``images``, ``videos`` and ``date``.

	    A keyboard interrupt ends the process via ``sys.exit()``.
	    """
	    try:
	        limit = 12
	        offset = 0
	        # A set of stripped names gives O(1) lookups and still matches a
	        # final line that has no trailing newline.
	        seen = set()
	        if archive:
	            if not os.path.exists(archive):
	                Path(archive).touch()
	            with open(archive, 'r') as handle:
	                seen = {line.strip() for line in handle if line.strip()}
	        while True:
	            entries = fetch_json(url, params={'limit': limit, 'offset': offset})
	            # An empty page means the listing is exhausted; None means the
	            # request failed. Either way there is nothing left to iterate.
	            if not entries:
	                break
	            for item in entries:
	                link = item['canonical_url']
	                if os.path.basename(link) in seen:
	                    continue
	                post_type = item['type']
	                html = fetch_html(link)
	                content = Soup(html).find('div', {'class': 'markup'}) if html else None
	                if content:
	                    if post_type == 'video':
	                        videos = content.find('div', {'id': 'media-'}, partial=True)
	                    else:
	                        videos = []
	                    yield {
	                        'title': item['title'],
	                        'subtitle': item['subtitle'],
	                        'type': post_type,
	                        'link': link,
	                        'thumb': item['cover_image'],
	                        'md': html2md(content.html),
	                        'images': content.find('img'),
	                        'videos': videos,
	                        'date': item['post_date'],
	                    }
	                # Random pause after every page download.
	                # NOTE(review): presumably throttling to stay polite to the
	                # server; the original nesting of this pause was lost, confirm.
	                timeout = randrange(5, 60)
	                print('Waiting: %s' % timeout)
	                sleep(timeout)
	            offset += limit
	    except KeyboardInterrupt:
	        sys.exit()

	def html2md(html):
	    """Return ``html`` converted to Markdown by ``markdownify``."""
	    markdown_text = markdownify.markdownify(html)
	    return markdown_text

	def save_files(directory, items, archive=None):
	    """Write every item to ``directory`` as JSON and fetch its images.

	    Args:
	        directory: output directory, created if missing.
	        items: iterable of dicts as produced by ``fetch_and_parse``.
	        archive: optional path of a text file; the name of each saved post
	            is appended to it, one per line.

	    A keyboard interrupt ends the process via ``sys.exit()``.
	    """
	    exported_keys = ('title', 'subtitle', 'type', 'link', 'date', 'md')
	    try:
	        create_dir(directory)
	        started = perf_counter()
	        for item in items:
	            print(item['title'])
	            file_path = os.path.basename(item['link'])
	            # The Markdown travels inside the JSON document; no separate
	            # .md file is written.
	            payload = {key: item[key] for key in exported_keys}
	            json_path = f'{directory}{os.path.sep}{file_path}.json'
	            with open(json_path, 'w') as file:
	                file.write(json.dumps(payload))
	            if archive:
	                with open(f'{archive}', 'a') as saved:
	                    saved.write(f'{file_path}\n')
	            print(f'File saved: {file_path}.json')
	            save_article_thumb(directory, item)
	            asyncio.run(save_article_images(directory, item))
	        end = perf_counter()
	        print(f'It took {round(end-started, 0)} second(s) to complete.')
	    except KeyboardInterrupt:
	        sys.exit()

	def save_image(url, file_path, timeout=30):
	    """Download ``url`` and store it at ``file_path`` plus the URL's extension.

	    Nothing is downloaded or written when ``url`` is empty or its path has
	    no file extension. A failed download is reported and skipped so that one
	    broken image does not abort the remaining downloads.

	    Args:
	        url: address of the image, or a falsy value to do nothing.
	        file_path: destination path without extension.
	        timeout: seconds to wait on the server before giving up.
	    """
	    if not url:
	        return
	    # Read the extension from the URL path only, so a query string or
	    # fragment never ends up in the file name.
	    ext = os.path.splitext(urlsplit(url).path)[1]
	    if not ext:
	        return
	    try:
	        response = requests.get(url, timeout=timeout)
	        # Do not store an error page under an image file name.
	        response.raise_for_status()
	    except requests.exceptions.RequestException as err:
	        print(f'Image download failed: {url}: {err}')
	        return
	    target = '%s%s' % (file_path, ext)
	    with open(target, 'wb') as file:
	        file.write(response.content)
	    print('Image saved: %s' % target)

	def save_article_thumb(directory, item):
	    """Save the item's thumbnail, if it has one, next to its JSON file.

	    The destination is ``directory`` joined with the last component of
	    ``item['link']``; ``save_image`` appends the extension.
	    """
	    thumb_url = item['thumb']
	    if not thumb_url:
	        return
	    post_name = os.path.basename(item['link'])
	    save_image(thumb_url, f'{directory}{os.path.sep}{post_name}')

	async def save_article_images(directory, item):
	    """Download all images of one article into its own sub-directory.

	    Images are stored under ``directory/<last component of item['link']>/``,
	    each named after the last component of its URL without the extension
	    (``save_image`` appends the extension again).

	    Args:
	        directory: output directory of the run.
	        item: dict with ``link`` and ``images``; ``images`` may be falsy, a
	            single element or a list of elements exposing ``attrs``.
	    """
	    images = item['images']
	    if not images:
	        return
	    if not isinstance(images, list):
	        images = [images]
	    # Skip <img> elements that carry no usable src attribute.
	    urls = [src for src in (img.attrs.get('src') for img in images) if src]
	    if not urls:
	        return

	    article_dir = f"{directory}{os.path.sep}{os.path.basename(item['link'])}"
	    # exist_ok avoids the check-then-create race of isdir() + makedirs().
	    os.makedirs(article_dir, exist_ok=True)

	    def destination(url):
	        # File name without extension, taken from the URL path only.
	        stem = os.path.splitext(os.path.basename(urlsplit(url).path))[0]
	        return f'{article_dir}{os.path.sep}{stem}'

	    # save_image blocks on network and disk I/O; calling it directly from a
	    # coroutine would run the downloads one after another. Handing each call
	    # to the default executor lets them actually overlap.
	    loop = asyncio.get_running_loop()
	    downloads = [
	        loop.run_in_executor(None, save_image, url, destination(url))
	        for url in urls
	    ]
	    await asyncio.gather(*downloads)

	def arguments():
	    """Parse the command line and return the resulting namespace.

	    Positional arguments are ``url`` and ``dir``; ``--archive`` is optional
	    and defaults to ``None``.
	    """
	    cli = argparse.ArgumentParser(description='Substack Downloader')
	    positionals = (
	        ('url', 'Substack URL to download'),
	        ('dir', 'Directory where to download'),
	    )
	    for name, help_text in positionals:
	        cli.add_argument(name, help=help_text)
	    cli.add_argument(
	        '--archive',
	        required=False,
	        help='Archive that saves list of downloaded files',
	    )
	    return cli.parse_args()

	# Script entry point: stream posts from the listing straight into the writer.
	if __name__ == '__main__':
	    args = arguments()
	    posts = fetch_and_parse(args.url, args.archive)
	    save_files(args.dir, posts, args.archive)

view raw ssjl.py hosted with ❤ by GitHub

Zombies & Aliens AT THE SAME TiME

Discussion about this post

Zombies & Aliens AT THE SAME TiME

Substack Downloader

Are you trying to download Substack content? I've got you covered!

Discussion about this post