webscraping documentation

Introduction

Background

For the last few years I have been specializing in web scraping, and I have collected what I found useful into this library.

All code is pure Python and has been run on multiple Linux servers, Windows machines, and Google App Engine.

Install

Some options to install the webscraping package:

  1. Check out the repository: hg clone https://code.google.com/p/webscraping/

  2. Download the zip: https://pypi.python.org/pypi/webscraping/

  3. Install with pip: pip install webscraping

The only dependency is Python 2.5 or higher.
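
After installing, you can confirm the package is available by importing the modules used in the examples below:

from webscraping import common, download, xpath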

License

This code is licensed under the LGPL.

Examples

Simple extraction

Extract the project title from the Google Code page:

from webscraping import download, xpath
D = download.Download()
# download and cache the Google Code webpage
html = D.get('http://code.google.com/p/webscraping')
# use xpath to extract the project title
project_title = xpath.get(html, '//div[@id="pname"]/a/span')
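
The xpath.get call returns a single value; the later examples also use xpath.search, which returns a list of every match. For instance, to collect all of the link targets from the same page (a small sketch reusing the html variable from above):

project_links = xpath.search(html, '//a/@href')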

Blog scraper

Scrape all articles from a blog:

import itertools
import urlparse
from webscraping import common, download, xpath

DOMAIN = ...
writer = common.UnicodeWriter('articles.csv')
writer.writerow(['Title', 'Num reads', 'URL'])
seen_urls = set() # track which article URLs have already been seen, to prevent duplicates
D = download.Download()

# iterate each of the categories
for category_link in ('/developer/knowledge-base?page=%d', '/developer/articles?page=%d'):
    # iterate the pages of a category
    for page in itertools.count():
        category_html = D.get(urlparse.urljoin(DOMAIN, category_link % page))
        article_links = xpath.search(category_html, '//div[@class="morelink"]/a/@href')
        num_new_articles = 0
        for article_link in article_links:
            # scrape each article
            url = urlparse.urljoin(DOMAIN, article_link)
            if url not in seen_urls:
                num_new_articles += 1
                seen_urls.add(url)
                html = D.get(url)
                title = xpath.get(html, '//div[@class="feed-header-wrap"]/h2')
                num_reads = xpath.get(html, '//li[@class="statistics_counter last"]/span').replace(' reads', '')
                row = title, num_reads, url
                writer.writerow(row)
        if num_new_articles == 0:
            break # have found all articles for this category
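
Once the crawl finishes, the articles.csv output can be read back with the standard csv module (a minimal sketch, independent of the webscraping package):

import csv

reader = csv.reader(open('articles.csv'))
reader.next() # skip the 'Title', 'Num reads', 'URL' header row
for title, num_reads, url in reader:
    print url, title, num_reads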

Business directory threaded scraper

Scrape all businesses from a popular business directory:

import re
import string
from webscraping import common, download, xpath

DOMAIN = ...

class BusinessDirectory:
    def __init__(self, output_file='businesses.csv'):
        self.writer = common.UnicodeWriter(output_file)
        self.writer.writerow(['Name', 'Address'])

    def __call__(self, D, url, html):
        urls = []
        if url == DOMAIN:
            # crawl the index pages
            urls = [DOMAIN + '/atoz/%s.html' % letter for letter in string.uppercase + '#']
        elif re.search(r'/atoz/\w\.html', url):
            # crawl the categories
            urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="partitionContainer"]//a/@href')]
        elif re.search(r'/atoz/\w/\d+\.html', url):
            # crawl the businesses
            urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="listingsContainer"]//a/@href')]
        else:
            # scrape business details
            name = xpath.get(html, '//h1[@class="listingName"]')
            address = xpath.get(html, '//span[@class="listingAddressText"]')
            row = name, address
            self.writer.writerow(row)
        return urls

# start the threaded crawler
download.threaded_get(url=DOMAIN, proxy_file='proxies.txt', cb=BusinessDirectory())

Daily deal threaded scraper

Scrape all deals from a popular daily deal website:

import re
import csv
import urlparse
from webscraping import common, download, xpath


DOMAIN = ...
writer = csv.writer(open('daily_deals.csv', 'wb'))
writer.writerow(['Company', 'Address', 'Website', 'Email'])

def daily_deal(D, url, html):
    """This callback is called after each download
    """
    if url == DOMAIN:
        # first download - get all the city deal pages
        links = [link.replace('/deals/', '/all-deals/') for link in xpath.search(html, '//a[@class="jCityLink"]/@href')]
    elif '/all-deals/' in url:
        # city page downloaded - get all the deals
        links = re.findall('"dealPermaLink":"(.*?)"', html)
    else:
        # deal page downloaded - extract the details
        company = xpath.get(html, '//div[@class="merchantContact"]/h2')
        website = xpath.get(html, '//div[@class="merchantContact"]/a/@href')
        address = common.unescape(xpath.get(html, '//div[@class="merchantContact"]/text()')).replace('Returns:', '').strip()
        if website:
            # crawl website for contact email
            email = '\n'.join(D.get_emails(website))
        else:
            email = None
        row = company, address, website, email
        # write deal details to CSV
        writer.writerow(row)
        links = []

    return [urlparse.urljoin(DOMAIN, link) for link in links]

# start the crawler
download.threaded_get(url=DOMAIN, proxy_file='proxies.txt', cb=daily_deal, num_retries=1)
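
Both threaded examples follow the same pattern: threaded_get downloads pages across multiple threads and, after each download, calls the supplied callback with the downloader, the URL, and the HTML; the list of URLs the callback returns is then crawled in turn. A minimal callback skeleton along those lines (the extraction step and the example.com start URL are placeholders):

import urlparse
from webscraping import download, xpath

DOMAIN = 'http://example.com' # placeholder start URL

def my_callback(D, url, html):
    """Called by threaded_get after each page has been downloaded."""
    # ... extract and store any data of interest from html here ...
    # the URLs returned below are added to the crawl queue;
    # return an empty list to stop following links from this page
    return [urlparse.urljoin(DOMAIN, link) for link in xpath.search(html, '//a/@href')]

download.threaded_get(url=DOMAIN, cb=my_callback)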

Reference

adt Module

alg Module

common Module

download Module

pdict Module

webkit Module

xpath Module