I recently acquired an appreciation for audiobooks. I listen to multiple
audiobooks every month and I prefer DRM-free audiobooks so that I can listen
through my favorite audiobook reader on each of my devices.
Downpour and eMusic are the only services I know of that provide a large
selection of DRM-free audiobooks. Unfortunately, neither site carries every
audiobook, so I often end up searching both sites for each book to discover
which versions (if any) are available from each. I got sick of searching both
websites all the time, so I created a Python script to do that work for me.
#!/usr/bin/env python
"""
Search downpour.com and emusic.com for DRM-free audiobooks
Usage::
./find_audiobooks.py <title>...
File released to the public domain under CC0 license:
http://creativecommons.org/publicdomain/zero/1.0/deed
Requires purl and beautifulsoup4::
$ pip install purl beautifulsoup4
"""
from __future__ import unicode_literals
import sys
from itertools import chain, izip_longest
import urllib2
from bs4 import BeautifulSoup
from purl import URL
def unescape(text):
    """Return text with every smart apostrophe (U+2019) made a plain one.

    Only the right single quotation mark is replaced; all other characters
    are returned unchanged.
    """
    smart_apostrophe = '\u2019'
    plain_apostrophe = "'"
    return text.replace(smart_apostrophe, plain_apostrophe)
def get_downpour_url(book_name):
    """Build the downpour.com search URL for book_name.

    The book name is attached to the catalog search page as the ``q``
    query parameter and the resulting URL is returned via ``as_string()``.
    """
    search_page = URL("http://www.downpour.com/catalogsearch/result/")
    with_query = search_page.query_param('q', book_name)
    return with_query.as_string()
def get_emusic_url(book_name):
    """Build the emusic.com book search URL for book_name.

    The book name is attached to the book search page as the ``s`` query
    parameter and the resulting URL is returned via ``as_string()``.
    """
    search_page = URL("http://www.emusic.com/search/book/")
    with_query = search_page.query_param('s', book_name)
    return with_query.as_string()
def search_downpour(book_name):
    """Search Downpour and return list of parsed results.

    Fetches the Downpour search page for ``book_name`` and returns one dict
    per ``li.item`` entry that has a linked product name. Each dict has the
    keys ``title`` (link text with smart apostrophes replaced), ``link``
    (the anchor's ``href``) and ``author`` (text following a leading "By"
    in an ``author`` element, or an empty string when none is present).

    Errors raised by ``urllib2.urlopen`` are propagated to the caller.
    """
    response = urllib2.urlopen(get_downpour_url(book_name))
    try:
        page = BeautifulSoup(response)
    finally:
        # Parsing is done (or failed); either way release the connection.
        response.close()

    results = []
    for book in page.find_all('li', attrs={'class': "item"}):
        header = book.find(attrs={'class': "product-name"})
        link_tag = header.find('a') if header is not None else None
        if link_tag is None:
            # Entry without a linked product name: nothing to report, and
            # dereferencing it would abort the whole search.
            continue
        title = ' '.join(unescape(x) for x in link_tag.stripped_strings)

        # Reset per book so an entry without a "By ..." node neither raises
        # NameError nor inherits the author of the previous entry.
        author = ''
        for node in book.find_all(attrs={'class': 'author'}):
            author_text = node.text
            if author_text.startswith('By'):
                author = author_text[2:].strip()

        results.append({
            'title': title,
            'link': link_tag['href'],
            'author': author,
        })
    return results
def search_emusic(book_name):
    """Search eMusic and return list of parsed results.

    Fetches the eMusic search page for ``book_name`` and returns one dict
    per ``li.bundle`` entry that has a link inside an ``h4``. Each dict has
    the keys ``title`` (the link text), ``link`` (the anchor's ``href``)
    and ``author`` (the text of the entry's ``h5``, or an empty string when
    the entry has no ``h5``).

    Errors raised by ``urllib2.urlopen`` are propagated to the caller.
    """
    response = urllib2.urlopen(get_emusic_url(book_name))
    try:
        page = BeautifulSoup(response)
    finally:
        # Parsing is done (or failed); either way release the connection.
        response.close()

    results = []
    for book in page.find_all('li', attrs={'class': "bundle"}):
        heading = book.find('h4')
        link_tag = heading.find('a') if heading is not None else None
        if link_tag is None:
            # Entry without a linked heading: skip it instead of letting an
            # AttributeError discard every other result.
            continue
        author_tag = book.find('h5')
        results.append({
            'title': link_tag.text,
            'link': link_tag['href'],
            'author': author_tag.text if author_tag is not None else '',
        })
    return results
def print_result(result):
    """Print title, author, and link for audiobook result.

    ``result`` is a mapping with ``title``, ``author`` and ``link`` keys.
    Three labelled lines are written to standard output, followed by one
    blank line that separates consecutive results.
    """
    # Each print takes exactly one parenthesised argument, so the output is
    # the same whether print is a statement (Python 2) or a function.
    print("Title: {}".format(result['title']))
    print("Author: {}".format(result['author']))
    print("Link: {}".format(result['link']))
    print("")
def merge_lists(*lists):
    """Return merge of lists by alternating elements of each.

    Takes the first element of every list, then the second of every list,
    and so on. Lists that run out early are simply skipped for the
    remaining rounds. Every element of the inputs is kept, including falsy
    ones such as ``0``, ``''``, ``{}`` or ``None``.
    """
    # A private sentinel pads the shorter lists, so only the padding is
    # removed afterwards -- never a legitimate element that happens to be
    # falsy.
    padding = object()
    interleaved = chain.from_iterable(izip_longest(*lists, fillvalue=padding))
    return [item for item in interleaved if item is not padding]
def main(*args):
    """Search both audiobook sites for each title and print the matches.

    Every positional argument is treated as one book name. For each name,
    the first three Downpour results and the first three eMusic results
    are combined with ``merge_lists`` and printed with ``print_result``.
    """
    for book_name in args:
        downpour_top = search_downpour(book_name)[:3]
        emusic_top = search_emusic(book_name)[:3]
        for result in merge_lists(downpour_top, emusic_top):
            print_result(result)
if __name__ == "__main__":
    # Everything after the script name is passed on as a book title.
    titles = sys.argv[1:]
    main(*titles)
Future Improvements
Some ideas for future improvements:
- Add Audible results when the book cannot be found in a DRM-free format
- Rewrite the script in JavaScript and create a Chrome extension out of it
- Use clint to colorize the command-line output