[mod_python] External connections in mod_python

Thu Apr 6 22:22:16 EDT 2006

I have a bit of code that's trying to access google from a mod_python
request handler, and it crashes Apache when it reaches the line

response = opener.open(self.url)

Is it possible to connect to external sites from Apache in this manner, or
should I just give up and use PHP?

import httplib, profile, pstats, re, sys, thread, threading, urllib2
from mod_python import apache
from BeautifulSoup import BeautifulSoup

MAXCONNECTIONS = 2
MAXRESULTS = 10

class PageGrabber(threading.Thread):
    connectionPool = threading.Semaphore(MAXCONNECTIONS)
    outputSemaphore = threading.Semaphore()

    def __init__(self, url, req):
        self.url = url
        self.req = req
        threading.Thread.__init__(self)

    def run(self):
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            PageGrabber.connectionPool.acquire()
            response = opener.open(self.url)
            PageGrabber.connectionPool.release()
            soup = BeautifulSoup(response)
            PageGrabber.outputSemaphore.acquire()
            #strip google cache header stuff
            cutPage = re.split('<hr>|<hr />', str(soup), 1)
            strippedPage = cutPage[1]
            #split into words
            wordList = re.split('\t|\n|
|<.*?>|\r|<!--|-->|&nbsp;|,|&quot;|&lt;|&gt;|\?|!|:|\*|\.|\[|\]|\(|\)|\'|\-|"|=|/|<|>|\+|&|@|#',
str(strippedPage))
            #print only nonzero words
            for word in wordList:
                if len(word) > 0:
                    self.req.write(word + ' / ')
            PageGrabber.outputSemaphore.release()
        except Exception:
            print 'timed out'
        return

def main(req):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    searchTerms = ['python']
    if len(searchTerms) > 1:
        searchString = '+'.join(searchTerms)
    else:
        searchString = searchTerms[0]
    response = opener.open('http://www.google.com/search?num=' +
str(MAXRESULTS) +'&q=' + searchString)

    soup = BeautifulSoup(response)
    cachedURLs = []
    #extract cache urls
    for result in soup('a'):
        if re.search('q=cache', result['href']):
            cachedURLs.append(result['href'])

    #start threads to pull down cached data
    for cachedURL in cachedURLs:
        #use google's image strip thingie
        stripURL = cachedURL + '&strip=1'
        PageGrabber(stripURL, req).start()

def handler(req):
    req.content_type = 'text/plain'
    main(req)
    return apache.OK
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mm_cfg_has_not_been_edited_to_set_host_domains/pipermail/mod_python/attachments/20060406/9a338a17/attachment.html