At the beginning of December last year I realized I really liked the music played between the subtopics in the Freakonomics podcast. I found a forum post asking for a convenient way to get access to the music, but the only answer was that the tracks were listed in the transcripts in [MUSIC...] tags.

So, how to get access to them via Spotify:

Since the information we are looking for, artist and track name, is available in a predefined format, all that is needed is to scrape the site, meaning having a program visit each page and collect the tags. Then the tags must be cleaned to filter out noise, and lastly they must be searched for on Spotify and added to a playlist.

The first step is quite easy with a web crawler. I reused and modified some existing Python crawling code and wrote a simple parser for the content. The output is a comma-separated list of artist and song names. To turn this list into a Spotify playlist, there is a cool online tool called Ivy that does just that, given the prepared input.

Ivy was able to find 60 of the 136 unique songs at the time of the "experiment".

This is the initial code:

import httplib
import re
import sys

from posixpath import join, dirname, normpath
from threading import Thread, Lock
from urllib import quote

class Document(object):
    """Snapshot of one fetched HTTP response.

    Attributes:
        url: the URL the response was fetched from.
        query: the text after the last '?' in ``url``, or '' when the URL
            contains no '?'.
        status: the numeric HTTP status taken from ``res.status``.
        text: the response body, read eagerly via ``res.read()``.
    """

    def __init__(self, res, url):
        """Capture ``res`` (an object exposing ``status`` and ``read()``)."""
        self.url = url
        self.query = url.split('?')[-1] if '?' in url else ''
        self.status = res.status
        # Read the body now so the caller can release the connection.
        self.text = res.read()

class Crawler(object):
A Crawler that crawls through cplusplus.com
def __init__(self):
self.host = None
self.visited = {}
self.targets = set()
self.threads = []
self.concurrency = 0
self.max_outstanding = 1 # ORIGINAL 16

self.follow_mode = self.F_SAME_HOST
self.content_type_filter = '(text/html)'
self.url_filters = []
self.prefix_filter = '^(#|javascript:|mailto:)'

self.targets_lock = Lock()
self.concurrency_lock = Lock()

def set_content_type_filter(self, cf):
self.content_type_filter = '(%s)' % ('|'.join(cf))

def add_url_filter(self, uf):

def set_follow_mode(self, mode):
if mode > 5:
raise RuntimeError('invalid follow mode.')
self.follow_mode = mode

def set_concurrency_level(self, level):
self.max_outstanding = level

def process_document(self, doc):
print 'GET', doc.status, doc.url

def crawl(self, url):
self.root_url = url

rx = re.match('(https?://)([^/]+)([^\?]*)(\?.*)?', url)
self.proto = rx.group(1)
self.host = rx.group(2)
self.path = rx.group(3)
self.dir_path = dirname(self.path)
self.query = rx.group(4)


while self.threads:
for t in self.threads:
if not t.isAlive():
except KeyboardInterrupt, e:

def _url_domain(self, host):
parts = host.split('.')
if len(parts) <= 2:
return host
elif re.match('^[0-9]+(?:\.[0-9]+){3}$', host): # IP
return host
return '.'.join(parts[1:])

def _follow_link(self, url, link):
# Skip prefix
if re.search(self.prefix_filter, link):
return None

# Filter url
for f in self.url_filters:
if re.search(f, link):
return None

rx = re.match('(https?://)([^/:]+)(:[0-9]+)?([^\?]*)(\?.*)?', url)
url_proto = rx.group(1)
url_host = rx.group(2)
url_port = rx.group(3) if rx.group(3) else ''
url_path = rx.group(4) if len(rx.group(4)) > 0 else '/'
url_dir_path = dirname(url_path)

rx = re.match('((https?://)([^/:]+)(:[0-9]+)?)?([^\?]*)(\?.*)?', link)
link_full_url = rx.group(1) != None
link_proto = rx.group(2) if rx.group(2) else url_proto
link_host = rx.group(3) if rx.group(3) else url_host
link_port = rx.group(4) if rx.group(4) else url_port
link_path = quote(rx.group(5), '/%') if rx.group(5) else url_path
link_query = quote(rx.group(6), '?=&%') if rx.group(6) else ''
link_dir_path = dirname(link_path)

if not link_full_url and not link.startswith('/'):
link_path = normpath(join(url_dir_path, link_path))

link_url = link_proto + link_host + link_port + link_path + link_query

if self.follow_mode == self.F_ANY:
return link_url
elif self.follow_mode == self.F_SAME_DOMAIN:
return link_host if self._url_domain(self.host) == \
self._url_domain(link.host) else None
elif self.follow_mode == self.F_SAME_HOST:
return link_url if self.host == link_host else None
elif self.follow_mode == self.F_SAME_PATH:
if self.host == link_host and \
return link_url
return None

def _add_target(self, target):
if not target:

if self.visited.has_key(target):

def _spawn_new_worker(self):
self.concurrency += 1
t = Thread(target=self._worker, args=(self.concurrency,))
t.daemon = True

def _worker(self, sid):
while self.targets:
url = self.targets.pop()
self.visited[url] = True

rx = re.match('https?://([^/]+)(.*)', url)
host = rx.group(1)
path = rx.group(2)

conn = httplib.HTTPConnection(host, timeout=10)
conn.request('GET', path)
res = conn.getresponse()

if res.status == 301 or res.status == 302:
rlink = self._follow_link(url, res.getheader('location'))

# Check content type
if not re.search(self.content_type_filter,
except TypeError: # getheader result is None

doc = Document(res, url)

# Make unique list
links = re.findall('''href\s*=\s*['"]\s*([^'"]+)['"]''',
doc.text, re.S)
links = list(set(links))

for link in links:
if re.search('''freakonomics.com/\d{4}/\d{2}/\d{2}/([a-z-]+)/$''', link, re.I): # ADDED
rlink = self._follow_link(url, link.strip())

if self.concurrency < self.max_outstanding:
except KeyError as e:
# Pop from an empty set
except (httplib.HTTPException, EnvironmentError) as e:
#print '%s, retrying' % str(e)

self.concurrency -= 1

from creepy import Crawler
import re
import sys
import time
from random import randint

Crawl pages that look like podcast transcripts, starting from the given URL.
** Should be fixed to only crawl the links on the first page. As it stands, it keeps going on and on...
Modifications have also been made to creepy.py (written by AZ Huang <aitjcize@gmail.com>):
setting threads to 1
restricting the kinds of URLs allowed to be added to the queue

class MyCrawler(Crawler):
    """Crawler that prints the contents of every [MUSIC: ...] tag it finds."""

    # Captures the text between "[MUSIC:" and the closing "]".
    # Handy for testing the pattern: https://pythex.org/
    MUSIC_TAG_RE = re.compile(r'''\[\s*MUSIC\s*:([^\]]+)\]''', re.I)

    def process_document(self, doc):
        """Print a '***' header line for the page, then one '*' line per tag.

        Only responses with status 200 are scanned.
        """
        if doc.status == 200:
            print('*** [%d] %s' % (doc.status, doc.url))
            for song in self.MUSIC_TAG_RE.findall(doc.text):
                print('*%s' % song)

        # be nice to the server: pause 0.5 to 2 seconds between pages
        # NOTE(review): the original nesting of this pause was lost; it is
        # placed at method level so that every processed page is throttled.
        sleep_time = randint(500, 2000) / 1000.0
        time.sleep(sleep_time)

# Create the crawler and start it at the podcast archive page.
crawler = MyCrawler()
crawler.crawl('http://freakonomics.com/radio/freakonomics-radio-podcast-archive/') # the podcast overview page

# -*- coding: utf-8 -*-

Simple cleaning of the result of the crawler output
usage: pipe result to a file and upload to Ivy or similar service for Spotify links

import sys
import re
import HTMLParser

h = HTMLParser.HTMLParser()

# The crawler output file is the only command-line argument.
try:
    filename = sys.argv[1]
except IndexError:
    # Interpolate before exiting; formatting the result of sys.exit() never
    # runs, which left a literal "%s" in the usage message.
    sys.exit("Usage: %s filename" % sys.argv[0])

with open(filename) as data:
    lines = data.read().splitlines()

all_songs = []

for line in lines:
    # Lines starting with "***" only record which episode the songs were
    # fetched from; they are not songs themselves.
    if line.startswith("***"):
        continue

    line = unicode(line, "utf-8")
    line = h.unescape(line)                   # decode HTML entities
    line = re.sub(r'^\*\s{0,2}', '', line)    # drop the leading "*" marker
    line = re.sub(r'<[^>]+>', '', line)       # drop HTML tags
    line = re.sub(r'\([^\)]+\)', '', line)    # drop parenthesised remarks

    # Normalise the artist/title separators to commas.
    line = line.replace(";", ",")
    line = line.replace(" -", ",")

    # Strip surrounding whitespace so otherwise equal entries deduplicate,
    # and skip lines that end up empty.
    line = line.encode('ascii', 'ignore').strip()
    if line:
        all_songs.append(line)

all_songs = sorted(set(all_songs))
for item in all_songs:
    print(item)

The result of the scripts running:

size 180.0 KiB
sha256: 57311cd838...f6dc113136

size 4.7 KiB
sha256: 6eb04b6133...8b4bfe6a64

