[mod_python] Mapping a URI to pages dynamically.

Mon May 19 15:59:32 EST 2003

Sorry for not replying earlier. Here is the code we use.
I have removed the pieces of the code which deal with other stuff;
I hope I have not removed too much.
This code is due to Dustin Mitchell, who set up our site
initially.

We use translate.py as our translate handler, which, as you can
see, creates a req.url object holding lots of information about the
requested URL. The check_dont_handle function allows us to not
handle certain file types or certain subtrees of the DOCROOT.

Hope you find the code useful.

- Murali

------ translate.py ------------
from mod_python import apache, util
from utils import urls
import os, string, config

# This module is responsible for translating the URL space as designed
# into a filename, which Apache will use to control further processing
# of the request.
#
# It does lots of other things too, basically to process the request
# into a nice, Python-ish format.

def transhandler(req):
   """Translate handler: map the request URI onto a script file.

   On success the following attributes are set on req:
     url       -- a urls.URL object describing the requested URL
     secure    -- result of is_secure(req)
     filename  -- copied from req.url.filename
     modpath   -- copied from req.url.modpath
     form_data -- util.FieldStorage built from the request
   and "handlers.dispatch" is registered as the PythonHandler.

   Returns apache.OK on success, or apache.HTTP_NOT_FOUND if anything
   goes wrong while translating (the failure is written to the Apache
   error log first).  An apache.SERVER_RETURN raised along the way is
   passed through untouched.
   """
   # First check if we're configured not to handle this request.
   # This raises apache.SERVER_RETURN when the request is declined, so
   # it is deliberately kept outside the try block below.
   check_dont_handle(req)

   # Otherwise begin handling this normally
   try:
     # We use the URL class to calculate all of the parts of the URL
     # we were given
     req.url = urls.URL(req)

     # Check if this is a secure connection
     req.secure = is_secure(req)

     # Copy some important values into req
     req.filename, req.modpath = req.url.filename, req.url.modpath

     # Grab any relevant form data,
     req.form_data = util.FieldStorage(req, keep_blank_values=1,
                                       strict_parsing=1)

     # and register the next handler
     req.add_handler("PythonHandler", "handlers.dispatch")

   except apache.SERVER_RETURN:
     # SERVER_RETURN carries an explicit status chosen by whoever raised
     # it (presumably util.FieldStorage can do so for a malformed
     # request -- verify against the mod_python docs); do not disguise
     # it as a 404.
     raise

   # Handle any other error during that time by logging it and sending
   # the usual Apache 404 message.  The clause is intentionally broad:
   # a URL that matches no script surfaces here as an exception too.
   except:
     import sys
     exc_type, exc_value = sys.exc_info()[:2]
     apache.log_error("transhandler: cannot translate %s: %s: %s"
                      % (req.uri, exc_type, exc_value),
                      apache.APLOG_ERR)
     return apache.HTTP_NOT_FOUND

   return apache.OK

def check_dont_handle(req):
   """Supports the PythonOption DontHandle Apache configuration option:

   PythonOption DontHandle initial-uri[:initial-uri[:initial-uri...]]

   which causes the Python site to decline to handle any requests
   beginning with one of the specified initial-uri's.  In that case,
   normal Apache behavior will apply.

   Returns None when the request should be handled.  Raises
   apache.SERVER_RETURN carrying apache.DECLINED when req.uri starts
   with one of the configured prefixes.
   """
   options = req.get_options()
   if not options.has_key('DontHandle'):
     return

   uri = req.uri
   for prefix in options['DontHandle'].split(':'):
     # An empty entry (from a stray or doubled ':') is a prefix of every
     # URI and would make the site decline all requests, so skip it.
     if prefix and uri.startswith(prefix):
       # Same exception as the old "raise cls, value" spelling.
       raise apache.SERVER_RETURN(apache.DECLINED)

def is_secure(req):
   """Tell whether the request came in on the HTTPS port.

   The only signal consulted is the server's port number: the request
   counts as secure exactly when req.server.port equals 443.  This
   seemed the best way available to make the determination.
   """
   https_port = 443
   server_port = req.server.port
   return server_port == https_port

-----end of translate.py ---------

------ urls.py ---------------

import config
from mod_python import util
import os, string, copy
import re

# This module is responsible for translating the URL space as designed
# into its components:
#
# http://py.cs.uchicago.edu/path/to/program/args1/args2?internal+data
# ----   ------------------ --------------- ----------- -------------
# |      server             \ script_path   \ args_path \ internal
# \ scheme

# All of these fields are member variables of the URL class. Further,
# args_path is available in list form as url.arguments.
# url.internal is a dictionary representing the value of internal as
# a standard URL query string.  url.special contains the characters
# from internal if it contains no '=', and is empty otherwise.

# the module sets the following fields in req:
#  filename -- the filename of the file which will produce the final
#     response.  Apache cares about this.
#  modpath -- a list of nested Python modules which will succeed in
#     representing the script.
#
# It's important to note that these two fields define the file
# which will produce the final page, while the annotations on the URL
# are produced regardless of the Python program that will eventually
# produce the data.
#
# In particular, Apache is depending on this module to provide
# req.filename.  Apache config commands (e.g. <Directory>) will look
# at this result to determine what to do next.  To help out down the
# road, we also keep track of a module path to get to the relevant .py
# file, and the portion of the URI which specified that file.

# Python script files are:
#   'docroot'/'script_path'.py
# or
#   'docroot'/'script_path'/index.py
# And this program will find the longest possible match among those
# possibilities.

class URL:
   """Object to represent the URL for a given hit on the site.

   Attributes set by the constructor:
     scheme      -- 'https' if the server port is 443, else 'http'
     method      -- the request method, copied from req.method
     server      -- the server hostname
     filename    -- path of the .py file chosen for this URL
     modpath     -- tuple of module names leading to that file
     script_path -- the portion of the URI which selected the file
     arguments   -- list of the remaining, non-empty path components
     args_path   -- the arguments joined with '/'
   """

   def __init__(self, req):
     """Build the URL description from a request object.

     Reads req.server.port, req.method, req.server.server_hostname and
     req.uri.  Raises TypeError (from _parse_path) when no Python
     script matches the URI.
     """
     # get the scheme (e.g., http or https)
     if req.server.port == 443:
       self.scheme = 'https'
     else:
       self.scheme = 'http'

     # Get the method (e.g., 'GET', 'POST', 'HEAD')
     self.method = req.method

     # server name (just to be safe)
     self.server = req.server.server_hostname

     # parse the path, finding the longest part that still refers to a
     # Python script.
     self._parse_path(req.uri)

   # ----
   # Helper functions

   def _parse_path(self, path):
     """Find the longest leading part of path that names a script.

     Starting at config.docroot, path components are consumed one at a
     time for as long as they name directories.  At each level
     'filename'.py and 'filename'/index.py are tried; the last
     candidate found wins.  The result is stored in self.filename,
     self.modpath, self.script_path, self.arguments and self.args_path.

     If no candidate exists at any level, the final unpacking raises
     TypeError; callers are expected to treat that as "not found".
     """
     # script_path is a URL path, so it must be joined with '/' on
     # every platform; os.path.join would use '\\' on Windows.
     import posixpath

     # Strip that initial slash
     path = path[1:]

     # Break the path into components
     components = []
     if path:
       components = path.split('/')

     # State of the walk; arguments holds the components not yet used.
     filename = config.docroot
     modpath = config.docmod
     script_path = "/"
     arguments = components

     best = None

     # Loop over those components
     while 1:
       # Try two variations of the current filename, in order of length.
       # 'filename'.py
       fn = filename + '.py'
       if self._intree(fn, config.docroot) and os.path.isfile(fn):
         best = ( fn, modpath, script_path, arguments )

       # Only a directory can be descended into any further.
       if not os.path.isdir(filename):
         break

       # Since it's a directory we can try 'filename'/index.py
       fn = os.path.join(filename, 'index.py')
       if self._intree(fn, config.docroot) and os.path.isfile(fn):
         best = ( fn, modpath + ( 'index', ), script_path, arguments )

       # And we can try the next level of directory nesting, if there
       # is a component left to try.
       if not arguments:
         break

       # Get the first component (that used to be an argument)
       comp = arguments[0]

       # Ignore '.php', for compatibility.  Do this before the sanity
       # check below so that e.g. a bare '.php' component is caught
       # once it has been reduced to ''.
       if comp[-4:] == '.php':
         comp = comp[:-4]

       # Break out on any funny business
       if comp in ('', '.', '..'):
         break

       # Move down one level
       filename = os.path.join(filename, comp)
       modpath = modpath + ( comp, )
       script_path = posixpath.join(script_path, comp)
       arguments = arguments[1:]

     # best is still None here when nothing matched; the unpacking then
     # raises TypeError.
     ( self.filename, self.modpath,
       self.script_path, self.arguments ) = best

     # and touch up the arguments a little bit...
     # filter out any empty elements
     self.arguments = [arg for arg in self.arguments if arg]
     # and include a slash in the args_path only if necessary
     if self.script_path[-1] == '/':
       self.args_path = '/'.join(self.arguments)
     else:
       self.args_path = '/' + '/'.join(self.arguments)

   # Returns boolean indicating if PATH is in the directory tree rooted
   # at ROOT.  Also checks that PATH is absolute, and invariant under
   # normpath(), catching things like '../' and './'.  This class uses
   # this function as a sanity check.
   def _intree(self, path, root):
     """Return true if path is root itself or lies underneath it."""
     # Compare against ROOT plus a separator: a bare string-prefix test
     # would also accept siblings such as ROOT + '.py' or ROOT + 'x/...'.
     prefix = root
     if not prefix.endswith(os.sep):
       prefix = prefix + os.sep
     return path == os.path.abspath(path) and \
            path == os.path.normpath(path) and \
            (path == root or path.startswith(prefix))

---- end of urls.py -----