#!/usr/bin/python
#: gen-sitemap : generate sitemap.xml file
#
#  Copyright (c) 2007  Giacomo A. Catenazzi <cate@cateee.net>
#  This is free software, see GNU General Public License v2 for details

import sys, time, datetime, os, os.path, re, optparse, gzip, urllib
import xml.sax.saxutils


# global settings

# URLs collected from 'ping:' directives; fetched at the end under --notify.
pings = list()

# Defaults, replaced by the 'base-url:', 'base-path:' and 'sitemap:'
# directives of the configuration file.
baseurl, basepath = "http://localhost/", "./"
outputs = ("sitemap.xml.gz",)
# Directory and URL prefix used for the generated files; these are refreshed
# from basepath/baseurl whenever a 'sitemap:' directive is parsed.
out_path, out_url = basepath, baseurl

# Command line arguments

# Command-line interface.  The options are parsed at import time (the file is
# a script); the resulting `options` object is read by log(), error(),
# conf_run() and the output code.
parser = optparse.OptionParser()
parser.set_defaults(verbose=1, notify=False, run=False, console=False,
                    conf="sitemap.conf")
parser.add_option("-q", "--quiet", dest="verbose",
                  action="store_const", const=0,
                  help="don't print warning messages")
parser.add_option("-v", "--verbose", dest="verbose",
                  action="count",
                  help="print more messages (may be repeated)")
parser.add_option("-c", "--conf", dest="conf",
                  action="store", type="string",
                  help="FILE with the configuration", metavar="FILE")
parser.add_option("-o", "--stdout", dest="console",
                  action="store_true",
                  help="Write the output to standard output insteat of sitemap files")
parser.add_option("-n", "--notify", dest="notify",
                  action="store_true",
                  help="send notice to web engines (according ping option)")
parser.add_option("-r", "--allow-run", dest="run",
                  action="store_true",
                  help="run scripts in 'run:' commands")

(options, args) = parser.parse_args()

if args:
    # parser.error() prints the usage message and terminates the program
    # itself, so no explicit sys.exit() is needed after it.
    parser.error("arguments are not needed (use only options)")


# Generic support functions

def log(string, level=1):
    "write `string` to standard output if the verbosity is at least `level`"
    if options.verbose < level:
        return
    sys.stdout.write(string + "\n")

def error(string):
    "write `string` to standard error, unless running with --quiet"
    if options.verbose <= 0:
        return
    sys.stderr.write(string + "\n")

def die(string):
    "write `string` to standard error and terminate with exit status 1"
    sys.stderr.write(string + "\n")
    raise SystemExit(1)

# Canonical output form; anchored so that trailing garbage does not take the
# shortcut in iso8601_time().
iso8601_time_re = re.compile(r"[12][0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-6][0-9]:[0-6][0-9]Z\Z")

def iso8601_time(date):
    """read ISO 8601 times, converting it into '%Y-%m-%dT%H:%M:%SZ'

    Accepted dates: YYYY, YYYY-MM, YYYY-DDD and YYYY-MM-DD (the dashes are
    optional).  An optional time follows a 'T' or a space: hh, hh:mm or
    hh:mm:ss (colons optional), optionally followed by a fraction (ignored)
    and by a zone: 'Z', '+hh', '+hhmm', '+hh:mm' (or with '-').
    A time without zone is taken as local time; a date without time is taken
    as midnight UTC.

    Returns the UTC time as a string, or None if `date` is not understood
    (week dates are recognised but not implemented).
    """
    import calendar  # local import: only needed by this function

    # shortcut: already in canonical form
    if iso8601_time_re.match(date):
        return date

    i = date.replace(" ", "T").find("T")
    if i >= 0:
        d, t = date[:i], date[i+1:]
    else:
        d, t = date, None

    d = d.replace("-", "")
    if "W" in d:
        log("ISO 8601 dates with weeks (W) not yet implemented in the libraries")
        return None
    fmt = {4: "%Y", 6: "%Y%m", 7: "%Y%j", 8: "%Y%m%d"}.get(len(d))
    if not fmt:
        return None

    offset = 0      # seconds to add to the parsed wall-clock time to get UTC
    local = False   # True when the time carries no zone designator
    if t:
        t = t.replace(":", "")
        s = len(t)
        i = 0
        while i < s and t[i].isdigit():
            i += 1
        fmt2 = {2: "T%H", 4: "T%H%M", 6: "T%H%M%S"}.get(i)
        if not fmt2:
            return None
        fmt += fmt2
        d += "T" + t[:i]
        # skip a decimal fraction: the separator AND its digits
        if i < s and t[i] in ".,":
            i += 1
            while i < s and t[i].isdigit():
                i += 1
        zone = t[i:]
        if not zone:
            local = True
        elif zone == "Z":
            offset = 0
        elif zone[0] in "+-" and len(zone) in (3, 5) and zone[1:].isdigit():
            offset = int(zone[1:3]) * 3600
            if len(zone) == 5:
                offset += int(zone[3:5]) * 60
            if zone[0] == "+":
                # east of Greenwich: UTC is earlier than the wall-clock time
                offset = -offset
        else:
            return None

    try:
        tm = time.strptime(d, fmt)
    except ValueError:
        return None

    if local:
        # mktime() interprets the fields as local time (DST included)
        stamp = time.mktime(tm)
    else:
        # timegm() interprets the fields as UTC, independently of the
        # local timezone and of its daylight saving rules
        stamp = calendar.timegm(tm) + offset
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stamp))


def split_str(string, sep=" \t"):
    """tokenize `string` on the characters of `sep`, honouring double quotes

    Quotes are removed from the tokens; inside quotes separators are kept
    and a backslash makes the next character literal.
    """
    tokens = []
    current = ""
    quoted = False
    text = string.strip(sep)
    end = len(text)
    pos = 0
    while pos < end:
        ch = text[pos]
        pos += 1
        if ch == '"':
            quoted = not quoted
            continue
        if quoted:
            if ch == '\\' and pos < end:
                ch = text[pos]
                pos += 1
            current += ch
            continue
        if ch not in sep:
            current += ch
            continue
        # a separator closes the token; a run of separators counts as one
        tokens.append(current)
        current = ""
        while pos < end and text[pos] in sep:
            pos += 1
    if current:
        tokens.append(current)
    return tokens

def unmask(string):
    """transform mask into a regexp

    In the mask '*' matches any run of characters except '/', '**' matches
    any run of characters, and a backslash makes the next character literal.
    A mask enclosed in double quotes has the quotes removed first.
    The returned regexp is meant for re.match(); it is anchored with '$'
    unless the mask ends with '/' (a directory mask, which matches as a
    prefix).
    """
    if len(string) > 2 and string[0] == '"' and string[-1] == '"':
        string = string[1:-1]
    # the length must be taken AFTER the quotes are stripped
    mx = len(string)
    parts = []
    pending_star = False    # an unescaped '*' not yet turned into a regexp
    i = 0
    while i < mx:
        c = string[i]
        i += 1
        if c == '*':
            if pending_star:
                parts.append(".*")
                pending_star = False
            else:
                pending_star = True
            continue
        if pending_star:
            parts.append("[^/]*")
            pending_star = False
        if c == '\\' and i < mx:
            c = string[i]
            i += 1
        # escape literals one by one, so that the result does not depend on
        # how re.escape() treats a placeholder character
        parts.append(re.escape(c))
    if pending_star:
        parts.append("[^/]*")
    mask = "".join(parts)
    if mask.endswith("/"):
        return mask
    return mask + '$'

# filters

def filter_shortcut(filename, filters):
    """tell whether the directory `filename` is worth descending into

    Only the leading directory rules (regexps ending with '/') are examined;
    the first non-directory rule ends the shortcut with True.
    """
    dirname = filename + "/"
    for pattern, include, _opts in filters:
        if pattern[-1] != '/':
            return True
        if not include:
            # an excluded directory: skip it with all its content
            if re.match(pattern, dirname):
                return False
            continue
        # an included directory: its parents must be visited too, so try
        # every leading part of the rule
        prefix = ""
        for component in pattern.split("/")[:-1]:
            prefix += component + "/"
            if re.match(prefix, dirname):
                return True
    return True

def filter_apply(filename, filters):
    """run `filename` through `filters` and add it to the global sitemap

    The options of every matching include rule are merged (earlier rules
    win).  The file is dropped when an exclude rule matches before any
    include rule did.
    """
    merged = {}
    file_hit = False    # an include rule for files matched
    dir_hit = False     # an include rule for directories matched
    for pattern, include, opts in filters:
        if not re.match(pattern, filename):
            continue
        if include:
            if pattern[-1] != '/':
                file_hit = True
            else:
                dir_hit = True
            complete_options(merged, opts)
        elif not (file_hit or dir_hit):
            # first match is negative
            return
    if merged.get('setlastmod', '') == "true":
        mtime = os.path.getmtime(os.path.join(basepath, filename))
        merged['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                          time.gmtime(mtime))
    parent, leaf = os.path.split(filename)
    if leaf == merged.get('index', ""):
        # index files are published under the URL of their directory
        url = parent + "/"
    else:
        url = filename
    map.add(url, merged)

# ordered (regexp, include-flag, options) rules, fed by the 'filter-*:' directives
filters = list()


# options

def parse_options(args, linenumber, opt_class, start=2):
    """collect the 'name=value' items of args[start:] into a dictionary

    Only the names listed in `opt_class` are accepted, and their values go
    through sanitize_option(); everything else is reported and skipped.
    """
    directive = args[0]
    parsed = {}
    for arg in args[start:]:
        name, equal, raw = arg.partition("=")
        if not equal:
            error_args('%s in %s' % (arg, directive), linenumber)
            continue
        if name not in opt_class:
            log("ignoring unknow parameter '%s' in '%s' at line %s" %
                (name, directive, linenumber))
            continue
        value = sanitize_option(name, raw)
        if value:
            parsed[name] = value
        else:
            log("invalid value of '%s' in '%s' at line %s" %
                (name, directive, linenumber))
    return parsed

def sanitize_option(option, value):
    """Check the format of standard options

    'priority' is normalised to one decimal digit, 'changefreq' must be one
    of the values in `changefreq`, 'lastmod' is converted by iso8601_time().
    Any other option is returned unchanged.
    Returns the (possibly normalised) value, or None when it is invalid.
    """
    if option == "priority":
        try:
            return "%.1f" % float(value)
        except ValueError:
            log("ignoring invalid value of 'priority': '%s'" % value)
            return None
    if option == "changefreq":
        if value not in changefreq:
            log("ignoring invalid value of 'changefreq': %s" % value)
            return None
    elif option == "lastmod":
        converted = iso8601_time(value)
        if not converted:
            # report the value as written by the user, not the failed result
            log("ignoring invalid value of 'lastmod': %s" % value)
        return converted
    return value

def complete_options(opt, opt2):
    """complete missing items in opt with those in opt2

    `opt` is modified in place; items already present in it are kept.
    """
    for key in opt2:
        if key not in opt:
            opt[key] = opt2[key]

def adapt_options(opt, adapt_table):
    """handle adapt options

    Apply the 'a-priority', 'a-changefreq' and 'a-lastmod' items of
    `adapt_table` to the corresponding 'priority', 'changefreq' and
    'lastmod' items of `opt` (modified in place):

      a-priority    a value starting with 0 or 1 replaces the priority; a
                    value starting with '+' or '-' is added to an existing
                    priority.  The result is kept within 0.0 .. 1.0.
      a-changefreq  replaces changefreq, if it is a known frequency.
      a-lastmod     replaces lastmod, if the value starts with a digit.

    Other items of `adapt_table` and malformed values are ignored.
    """
    for k, va in adapt_table.items():
        if not k.startswith("a-") or not va:
            continue
        target = k[2:]      # e.g. 'a-priority' adapts 'priority'
        if k == "a-priority":
            current = opt.get(target)
            try:
                if va[0] in "01":
                    value = float(va)
                elif va[0] in "+-" and current is not None:
                    value = float(current) + float(va)
                else:
                    continue
            except ValueError:
                continue
            opt[target] = "%.1f" % min(1.0, max(0.0, value))
        elif k == "a-changefreq":
            if va in changefreq:
                opt[target] = va
        elif k == "a-lastmod":
            if va[0].isdigit():
                opt[target] = va

# constants

# option names accepted by parse_options() for each kind of directive
opt_sitemap = ("changefreq", "priority", "lastmod", "class")
opt_path = opt_sitemap + ("index", "setlastmod", "ignore-filters")
opt_add_sm = ("a-changefreq", "a-priority", "a-lastmod", "class", "baseurl")

# values accepted for the 'changefreq' option
changefreq = (
    "always", "hourly", "daily", "weekly", "monthly", "yearly", "never",
)

# parse configuration items

def parse_conf(filename):
    """read the configuration file `filename` and execute its directives

    Empty lines and lines starting with '#' are skipped.  Every other line
    is split with split_str(); its first word selects the handler in `conf`
    (conf_warning for unknown words), called with the word list and the
    line number.
    """
    f = open(filename)
    try:
        linenumber = 0
        for line in f:
            linenumber += 1
            line = line.replace("\0", "").strip()
            if not line or line[0] == "#":
                continue
            args = split_str(line, " \t")
            if not args:
                # e.g. a line made only of quotes: nothing to dispatch
                continue
            conf.get(args[0], conf_warning)(args, linenumber)
    finally:
        # close the file even when a directive handler raises
        f.close()

def error_args(option, linenumber):
    "report an option used with a wrong number of arguments"
    message = "wrong number of arguments in option '%s' at line %s"
    error(message % (option, linenumber))

def conf_baseurl(args, linenumber):
    "'base-url:' directive: set the global baseurl, with a trailing slash"
    global baseurl
    if len(args) == 2:
        baseurl = args[1]
        if baseurl[-1] != '/':
            baseurl = baseurl + '/'
    else:
        error_args(args[0], linenumber)

def conf_basepath(args, linenumber):
    """'base-path:' directive: set the global basepath, with a trailing slash

    The program is terminated if the path is not an existing directory.
    """
    global basepath
    if len(args) != 2:
        error_args(args[0], linenumber)
        return
    basepath = args[1]
    if basepath[-1] != '/':
        basepath = basepath + '/'
    if os.path.isdir(basepath):
        return
    die("Error: basepath '%s' (at line %s)  doesn't point to a directory" %
        (basepath, linenumber))

def conf_sitemap(args, linenumber):
    """'sitemap:' directive: set the output file name(s)

    One name selects a single sitemap; with two names the first is used for
    the index and the second is a pattern whose '*' is replaced by the
    class.  The output path and URL are taken from the current basepath
    and baseurl.
    """
    global outputs, out_path, out_url
    if not 2 <= len(args) <= 3:
        error_args(args[0], linenumber)
        return
    outputs = args[1:]
    out_path, out_url = basepath, baseurl

def conf_add_url(args, linenumber):
    "'add-url:' directive: add args[1], with its options, to the global sitemap"
    if len(args) >= 2:
        url_options = parse_options(args, linenumber, opt_sitemap, 2)
        map.add(args[1], url_options)
    else:
        error_args(args[0], linenumber)

def conf_filter_reset(args, linenumber):
    "'filter-reset:' directive: forget all the filter rules"
    global filters
    if len(args) == 1:
        filters = []
    else:
        error_args(args[0], linenumber)

def conf_filter_ignore(args, linenumber):
    "'filter-ignore:' directive: append an exclude rule for the mask args[1]"
    if len(args) == 2:
        rule = (unmask(args[1]), False, {})
        filters.append(rule)
    else:
        error_args(args[0], linenumber)

def conf_filter_add(args, linenumber):
    "'filter-add:' directive: append an include rule (mask args[1]) with its options"
    if len(args) < 2:
        error_args(args[0], linenumber)
        return
    rule_options = parse_options(args, linenumber, opt_path, 2)
    rule = (unmask(args[1]), True, rule_options)
    filters.append(rule)

def parse_dir(filename, filters):
    """recurse into dirs

    `filename` is relative to basepath.  Names starting with a dot are
    skipped (a single '.' is allowed).  Plain files are handed to
    filter_apply(); directories are listed and visited recursively, unless
    filter_shortcut() tells that the whole directory can be skipped.
    """
    filename = os.path.normpath(filename)
    name = os.path.basename(filename.rstrip("/"))
    # allow single "."
    if len(name) > 1 and name[0] == ".":
        return
    path = os.path.join(basepath, filename)
    if not os.path.isdir(path):
        filter_apply(filename, filters)
        return
    if not filter_shortcut(filename, filters):
        return
    try:
        entries = os.listdir(path)
    except EnvironmentError:
        # sys.exc_info() gives the exception object with a syntax accepted
        # by every Python version
        err = sys.exc_info()[1]
        log("ignoring dir %s: system error(%s): %s" %
            (path, err.errno, err.strerror))
        return
    for entry in entries:
        parse_dir(os.path.join(filename, entry), filters)

def conf_add_dir(args, linenumber):
    """'add-dir:' directive: add the files below args[1] (relative to basepath)

    The given options apply to every file not handled by a more specific
    filter rule.  With 'ignore-filters=true' the global filters are not
    used and every file is added.
    """
    if len(args) < 2:
        error_args(args[0], linenumber)
        return
    op = parse_options(args, linenumber, opt_path, 2)

    filename = os.path.normpath(args[1])
    path = os.path.join(basepath, filename)
    if not os.path.exists(path):
        error("ignoring non existing path '%s' in '%s' at line %s" % (
            path, args[0], linenumber))
        return
    # catch-all rule carrying the options of this directive; filters are
    # always a LIST of (regexp, include, options) rules
    catch_all = [(".*$", True, op)]
    if op.get('ignore-filters', "") == 'true':
        parse_dir(filename, catch_all)
    else:
        parse_dir(filename, filters + catch_all)

def conf_include(args, linenumber):
    """'include:' directive: parse another configuration file

    args[1] is relative to basepath.  While the file is parsed the global
    filters are extended with a catch-all rule carrying the given options
    (or replaced by it with 'ignore-filters=true'); the previous filters
    are restored afterwards.
    """
    global filters
    if len(args) < 2:
        error_args(args[0], linenumber)
        return
    op = parse_options(args, linenumber, opt_path, 2)

    filename = os.path.normpath(args[1])
    path = os.path.join(basepath, filename)
    if not os.path.isfile(path):
        error("ignoring non-file path '%s' in '%s' at line %s" % (
            path, args[0], linenumber))
        return
    saved_filters = filters
    # filters are always a LIST of (regexp, include, options) rules
    catch_all = [(".*$", True, op)]
    if op.get('ignore-filters', "") == 'true':
        filters = catch_all
    else:
        filters = filters + catch_all
    try:
        parse_conf(path)
    finally:
        # restore the caller's filters even if the included file fails
        filters = saved_filters

def conf_add_list(args, linenumber):
    """'add-list:' directive: add the files listed in the file args[1]

    args[1] is relative to basepath and holds one file name per line (empty
    lines are skipped).  Every name goes through filter_apply() with the
    global filters plus a catch-all rule carrying the given options, or
    with the catch-all rule alone when 'ignore-filters=true'.
    """
    if len(args) < 2:
        error_args(args[0], linenumber)
        return
    op = parse_options(args, linenumber, opt_path, 2)

    filename = os.path.normpath(args[1])
    path = os.path.join(basepath, filename)
    if not os.path.isfile(path):
        error("ignoring non-file path '%s' in '%s' at line %s" % (
            path, args[0], linenumber))
        return
    catch_all = [(".*$", True, op)]
    if op.get('ignore-filters', "") == 'true':
        filt = catch_all
    else:
        filt = filters + catch_all

    f = open(path)
    try:
        for line in f:
            line = line.strip()
            if not line:
                continue
            filter_apply(os.path.normpath(line), filt)
    finally:
        # close the list even when filter_apply() raises
        f.close()

# matches one element '<tag ...>content</tag>'; groups: tag name, content
xml_re = re.compile(r"<(\w+)[^>]*>(.*?)</\1>", re.DOTALL)

def parse_sitemap(filename, def_opt, map):
    """read an existing sitemap and add its URLs to `map`

    filename  file relative to basepath; the type is selected by the
              suffix: '.xml', '.xml.gz' (sitemap or sitemap index, the
              sitemaps of an index are read recursively) or '.txt' (one
              URL per line).
    def_opt   options of the directive: 'baseurl' (also accepted as
              'base-url') is the prefix removed from the URLs, by default
              the global baseurl; the 'a-*' items are applied to every URL
              with adapt_options().
    map       sitemap object receiving the URLs through map.add().

    URLs not starting with the base URL are reported and skipped.
    """
    base = def_opt.get("baseurl", def_opt.get("base-url", baseurl))
    path = os.path.join(basepath, filename)

    if filename.endswith(".txt"):
        f = open(path)
        try:
            for line in f:
                # remove only the line terminator, never a real character
                line = line.rstrip("\r\n")
                if not line:
                    continue
                if line.startswith(base):
                    opt = {}
                    adapt_options(opt, def_opt)
                    map.add(line[len(base):], opt)
                else:
                    log("unknow base url in %s (from a sitemap)" % line)
        finally:
            f.close()
        return

    if filename.endswith(".xml"):
        f = open(path)
    elif filename.endswith(".xml.gz"):
        f = gzip.GzipFile(path)
    else:
        error("unknow sitemap type: %s" % filename)
        return
    try:
        src = f.read()
    finally:
        f.close()

    # the class is the part of the file name matched by the '*' of the
    # second output name
    clas = "0"
    if len(outputs) > 1:
        prefix, star, suffix = outputs[1].partition("*")
        if star:
            clas = filename[len(prefix):len(filename) - len(suffix)]

    unescape = xml.sax.saxutils.unescape
    for attr, val in xml_re.findall(src):
        if attr == "sitemapindex":
            for attr2, val2 in xml_re.findall(val):
                if attr2 != "sitemap":
                    continue
                for a, v in xml_re.findall(val2):
                    if a != "loc":
                        continue
                    # map.add() escapes the URLs again, so undo the XML
                    # escaping of the file here
                    v = unescape(v)
                    if v.startswith(base):
                        parse_sitemap(v[len(base):], def_opt, map)
                    else:
                        log("unknow base url in %s (from a sitemap index)" % v)
                    break
        elif attr == "urlset":
            for attr2, val2 in xml_re.findall(val):
                if attr2 != "url":
                    continue
                opt = {'class': clas}
                name = None
                for a, v in xml_re.findall(val2):
                    if a in opt_sitemap:
                        v = sanitize_option(a, v)
                        if v:
                            opt[a] = v
                    elif a == "loc":
                        v = unescape(v)
                        if v.startswith(base):
                            name = v[len(base):]
                        else:
                            log("unknow base url in %s (from a sitemap)" % v)
                            break
                    else:
                        log("ignoring unknow attribute %s (from a sitemap)" % a)
                if name:
                    adapt_options(opt, def_opt)
                    map.add(name, opt)

def conf_add_sitemap(args, linenumber):
    "'add-sitemap:' directive: import the URLs of the existing sitemap args[1]"
    if len(args) >= 2:
        import_options = parse_options(args, linenumber, opt_add_sm, 2)
        parse_sitemap(args[1], import_options, map)
    else:
        error_args(args[0], linenumber)


def conf_run(args, linenumber):
    """'run:' directive: run the shell command args[1] inside basepath

    The command is executed only with the --allow-run option; failures are
    reported but do not stop the program.
    """
    import subprocess  # local import: only needed by this directive

    if len(args) != 2:
        error_args(args[0], linenumber)
        return
    if not options.run:
        log("Non running %s" % args[1])
        return
    log("Running %s" % args[1])
    try:
        # cwd= replaces the hand-built "cd '...' ; command" string, which
        # broke on quotes in basepath and ran the command in the wrong
        # directory when the cd failed
        status = subprocess.call(args[1], shell=True, cwd=basepath)
    except EnvironmentError:
        error("cannot run '%s' at line %s: %s" %
              (args[1], linenumber, sys.exc_info()[1]))
        return
    if status != 0:
        error("command '%s' at line %s exited with status %s" %
              (args[1], linenumber, status))

def conf_ping(args, linenumber):
    "'ping:' directive: remember the URL args[1] in the global pings list"
    if len(args) == 2:
        pings.append(args[1])
    else:
        error_args(args[0], linenumber)

def conf_warning(args, linenumber):
    "fallback handler: report an unknown first word of a configuration line"
    unknown = args[0]
    error('unknow option "%s" at line %s' % (unknown, linenumber))


# Dispatch table used by parse_conf(): first word of a configuration line
# (colon included) -> handler, called as handler(args, linenumber).
conf = {
    # locations
    'base-url:': conf_baseurl,
    'base-path:': conf_basepath,
    'sitemap:': conf_sitemap,

    # filter rules
    'filter-reset:': conf_filter_reset,
    'filter-ignore:': conf_filter_ignore,
    'filter-add:': conf_filter_add,

    # content
    'add-url:': conf_add_url,
    'add-dir:': conf_add_dir,
    'add-list:': conf_add_list,
    'add-sitemap:': conf_add_sitemap,
    'include:': conf_include,

    # external actions
    'run:': conf_run,
    'ping:': conf_ping,
}


# generate filemap

# opening and closing text of a sitemap file (<urlset>)
sitemap_head = (
    "<?xml version='1.0' encoding='UTF-8'?>\n"
    '<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n'
    '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n'
    '  http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"\n'
    '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
)
sitemap_tail = "</urlset>\n"

# opening and closing text of a sitemap index file (<sitemapindex>)
index_head = (
    "<?xml version='1.0' encoding='UTF-8'?>\n"
    '<sitemapindex xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n'
    '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n'
    '  http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd"\n'
    '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
)
index_tail = "</sitemapindex>\n"

class sitemap:
    """sitemap container

    The URLs are grouped by class (the 'class' option, "0" by default).
    With a single output file all the classes are written into it; with
    two output names one file per class is written, plus an index.
    """

    def __init__(self):
        # class name -> {escaped absolute URL -> options dictionary}
        self.map = {}
        # classes only listed in the index (their existing file is kept)
        self.index = []
        # classes to be written, in order of first appearance
        self.clas = []

    def add(self, url, opt):
        "add `url` (relative to out_url) with the options `opt`"
        if url == "/":
            url = ""
        loc = xml.sax.saxutils.escape(out_url + url)
        c = opt.get('class', "0")
        if c not in self.map:
            self.map[c] = {}
            self.clas.append(c)
        self.map[c][loc] = opt

    def add_index(self, clas):
        "list `clas` in the index without writing its sitemap file"
        self.index.append(clas)

    def has_this_item(self, url, clas, opt):
        """tell whether `url` is in class `clas` with the same options

        `url` is a key of the container (escaped absolute URL).  Only
        'changefreq', 'priority' and 'lastmod' are compared.
        """
        items = self.map.get(clas)
        if items is None or url not in items:
            return False
        sopt = items[url]
        for k in ("changefreq", "priority", "lastmod"):
            if (k in sopt) != (k in opt):
                return False
            if k in sopt and sopt[k] != opt[k]:
                return False
        return True

    def write_file(self, path, classes):
        """write the URLs of `classes` as one sitemap file

        `path` None selects standard output; a name ending with '.xml.gz'
        is written compressed.  A warning is logged when the result exceeds
        the limits checked below (10 MB, 50,000 URLs).
        """
        urls = 0
        size = len(sitemap_head) + len(sitemap_tail)
        if path is None:
            f = sys.stdout
        elif path.endswith(".xml.gz"):
            f = gzip.GzipFile(path, "w")
        else:
            f = open(path, "w")
        try:
            f.write(sitemap_head)
            for c in classes:
                for loc, opt in sorted(self.map[c].items()):
                    if loc == "/":
                        loc = ""
                    item = "<url><loc>" + loc + "</loc>\n"
                    # fixed order, so that the output does not depend on
                    # the ordering of the dictionary
                    for o in ("lastmod", "changefreq", "priority"):
                        if o in opt:
                            item += "  <" + o + ">" + opt[o] + "</" + o + ">\n"
                    item += "</url>\n"
                    size += len(item)
                    urls += 1
                    f.write(item)
            f.write(sitemap_tail)
        finally:
            # standard output must stay usable for the following messages
            if f is not sys.stdout:
                f.close()
        if size > 10485760:
            log("warning: sitemap in '%s' has too big (%s > 10 MB)"
                % (path, size))
        if urls > 50000:
            log("warning: sitemap in '%s' has too much urls (%s > 50,000)"
                % (path, urls))

    def write(self):
        """write the output selected by the options and by `outputs`

        With --stdout everything goes to standard output.  With one output
        name a single sitemap is written.  Otherwise a file is written for
        every class in self.clas, and the index (first output name) lists
        these files and the files of the classes in self.index, with the
        modification time of each file.
        """
        if options.console:
            self.write_file(None, self.clas)
            return
        path0 = os.path.join(out_path, outputs[0])
        if len(outputs) == 1:
            self.write_file(path0, self.clas)
            return
        if path0.endswith(".xml.gz"):
            f = gzip.GzipFile(path0, "w")
        else:
            f = open(path0, "w")
        try:
            f.write(index_head)
            for c in sorted(self.clas + self.index):
                name = outputs[1].replace("*", c)
                path = os.path.join(out_path, name)
                loc = xml.sax.saxutils.escape(out_url + name)
                if c in self.clas:
                    self.write_file(path, (c,))
                t = time.gmtime(os.path.getmtime(path))
                lastmod = time.strftime("%Y-%m-%dT%H:%M:%SZ", t)
                f.write("<sitemap><loc>" + loc + "</loc>\n  <lastmod>"
                        + lastmod + "</lastmod>\n</sitemap>\n")
            f.write(index_tail)
        finally:
            f.close()

# the sitemap filled by the directives of the configuration file
map = sitemap()

log("parsing %s" % options.conf)
parse_conf(options.conf)

# try to merge existing sitemap: with one file per class, the file of a
# class whose content did not change is kept and only listed in the index
path0 = os.path.join(out_path, outputs[0])
if not options.console  and  len(outputs) > 1  and  os.path.isfile(path0):
    old = sitemap()
    parse_sitemap(outputs[0], {}, old)
    for c in map.clas[:]:
        new_items = map.map[c]
        old_items = old.map.get(c)
        # unchanged means: same number of URLs, and every old URL is still
        # there with the same options (so nothing was added or removed)
        unchanged = (old_items is not None
                     and len(old_items) == len(new_items))
        if unchanged:
            for k, v in old_items.items():
                if not map.has_this_item(k, c, v):
                    unchanged = False
                    break
        if not unchanged:
            continue
        # We can use the old sitemap
        map.add_index(c)
        del map.map[c]
        map.clas.remove(c)

map.write()

if options.notify:
    for ping in pings:
        log("Notifing %s" % ping)
        f = urllib.urlopen(ping)
        try:
            log(f.read(), level=2)
        finally:
            f.close()
elif pings:
    log("not pinging, because no --notify")

