Blake Householder
blake8086 at gmail.com
Thu Apr 6 22:22:16 EDT 2006
I have a bit of code that's trying to access Google from a mod_python request handler, and it crashes Apache when it reaches the line:

    response = opener.open(self.url)

Is it possible to connect to external sites from Apache in this manner, or should I just give up and use PHP?

import httplib, profile, pstats, re, sys, thread, threading, urllib2
from mod_python import apache
from BeautifulSoup import BeautifulSoup

MAXCONNECTIONS = 2
MAXRESULTS = 10

class PageGrabber(threading.Thread):
    connectionPool = threading.Semaphore(MAXCONNECTIONS)
    outputSemaphore = threading.Semaphore()

    def __init__(self, url, req):
        self.url = url
        self.req = req
        threading.Thread.__init__(self)

    def run(self):
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            PageGrabber.connectionPool.acquire()
            response = opener.open(self.url)
            PageGrabber.connectionPool.release()
            soup = BeautifulSoup(response)
            PageGrabber.outputSemaphore.acquire()
            #strip google cache header stuff
            cutPage = re.split('<hr>|<hr />', str(soup), 1)
            strippedPage = cutPage[1]
            #split into words
            wordList = re.split('\t|\n| |<.*?>|\r|<!--|-->| |,|"|<|>|\?|!|:|\*|\.|\[|\]|\(|\)|\'|\-|"|=|/|<|>|\+|&|@|#', str(strippedPage))
            #print only nonzero words
            for word in wordList:
                if len(word) > 0:
                    self.req.write(word + ' / ')
            PageGrabber.outputSemaphore.release()
        except Exception:
            print 'timed out'
            return

def main(req):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    searchTerms = ['python']
    if len(searchTerms) > 1:
        searchString = '+'.join(searchTerms)
    else:
        searchString = searchTerms[0]
    response = opener.open('http://www.google.com/search?num=' + str(MAXRESULTS) +'&q=' + searchString)
    soup = BeautifulSoup(response)
    cachedURLs = []
    #extract cache urls
    for result in soup('a'):
        if re.search('q=cache', result['href']):
            cachedURLs.append(result['href'])
    #start threads to pull down cached data
    for cachedURL in cachedURLs:
        #use google's image strip thingie
        stripURL = cachedURL + '&strip=1'
        PageGrabber(stripURL, req).start()

def handler(req):
    req.content_type = 'text/plain'
    main(req)
    return apache.OK

-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mm_cfg_has_not_been_edited_to_set_host_domains/pipermail/mod_python/attachments/20060406/9a338a17/attachment.html
|