|
Blake Householder
blake8086 at gmail.com
Thu Apr 6 22:22:16 EDT 2006
I have a bit of code that tries to access Google from a mod_python
request handler, and it crashes Apache when it reaches the line
response = opener.open(self.url)
Is it possible to connect to external sites from Apache in this manner, or
should I just give up and use PHP?
import httplib, profile, pstats, re, sys, thread, threading, urllib2
from mod_python import apache
from BeautifulSoup import BeautifulSoup
# Maximum number of simultaneous outbound HTTP connections (enforced by
# PageGrabber.connectionPool).
MAXCONNECTIONS = 2
# Number of search results to request from Google (the num= query parameter).
MAXRESULTS = 10
class PageGrabber(threading.Thread):
connectionPool = threading.Semaphore(MAXCONNECTIONS)
outputSemaphore = threading.Semaphore()
def __init__(self, url, req):
self.url = url
self.req = req
threading.Thread.__init__(self)
def run(self):
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
try:
PageGrabber.connectionPool.acquire()
response = opener.open(self.url)
PageGrabber.connectionPool.release()
soup = BeautifulSoup(response)
PageGrabber.outputSemaphore.acquire()
#strip google cache header stuff
cutPage = re.split('<hr>|<hr />', str(soup), 1)
strippedPage = cutPage[1]
#split into words
wordList = re.split('\t|\n|
|<.*?>|\r|<!--|-->| |,|"|<|>|\?|!|:|\*|\.|\[|\]|\(|\)|\'|\-|"|=|/|<|>|\+|&|@|#',
str(strippedPage))
#print only nonzero words
for word in wordList:
if len(word) > 0:
self.req.write(word + ' / ')
PageGrabber.outputSemaphore.release()
except Exception:
print 'timed out'
return
def main(req, searchTerms=None):
    """Search Google for searchTerms (default ['python']) and write the
    words of each cached result page to req via PageGrabber threads.

    searchTerms is a backward-compatible addition: a list of words joined
    with '+' into the query string; existing callers pass only req.
    """
    if searchTerms is None:
        searchTerms = ['python']
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # '+'.join handles the one-word case identically to indexing, and
    # yields '' for an empty list instead of crashing.
    searchString = '+'.join(searchTerms)
    response = opener.open('http://www.google.com/search?num=' +
                           str(MAXRESULTS) + '&q=' + searchString)
    soup = BeautifulSoup(response)
    # Extract the "cached page" links from the result page.  Use .get():
    # soup('a') returns every anchor, and indexing ['href'] on one with no
    # href attribute raised KeyError in the original.
    cachedURLs = []
    for result in soup('a'):
        href = result.get('href')
        if href and re.search('q=cache', href):
            cachedURLs.append(href)
    # Start one thread per cached page; &strip=1 asks Google's cache for
    # the text-only (image-stripped) version.
    grabbers = []
    for cachedURL in cachedURLs:
        grabber = PageGrabber(cachedURL + '&strip=1', req)
        grabber.start()
        grabbers.append(grabber)
    # BUG FIX: wait for the workers.  The original returned immediately,
    # so handler() finished the request while threads were still writing
    # to req -- the likely cause of the reported Apache crash.
    for grabber in grabbers:
        grabber.join()
def handler(req):
    """mod_python entry point: respond with a plain-text word dump."""
    # All real work (search, fetch, output) happens inside main(),
    # which writes directly to req.
    req.content_type = 'text/plain'
    main(req)
    return apache.OK
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mm_cfg_has_not_been_edited_to_set_host_domains/pipermail/mod_python/attachments/20060406/9a338a17/attachment.html
|