One of the most important things when you are building a static site generator like Nikola is that your site should not be broken. So, I really should have done this earlier ;-)
This is a very simple link checker that ensures the pages Nikola generates have no broken links. I will make it part of Nikola proper once the script is more polished and doit supports getting a list of targets.
To try it, save the script below and run it from the folder that contains your conf.py, right after you run doit.
"""Report broken relative links in the HTML pages generated by Nikola.

Run it from the folder that holds ``conf.py``, right after running doit.
"""

import os

try:  # Python 3
    from urllib.parse import unquote, urlparse
except ImportError:  # Python 2
    from urllib import unquote
    from urlparse import urlparse

# doit tasks whose targets are treated as generated HTML pages.
GENERATED_TASKS = (
    'render_tags',
    'render_archive',
    'render_galleries',
    'render_indexes',
    'render_pages',
    'render_site',
)


def link_target_path(filename, target):
    """Return the file or folder that a link found in *filename* points to.

    *target* is the link exactly as written in the page.  The result is an
    absolute path, or ``None`` when the link is not checked: a bare ``#``,
    or anything carrying a scheme or a host (``http:``, ``mailto:``,
    ``//cdn.example.com/...``), since only relative links are handled.
    """
    if target == "#":
        # These are always valid.
        return None
    parsed = urlparse(target)
    # TODO: check if the URL points to inside the generated site and check
    # it anyway.
    if parsed.scheme or parsed.netloc:
        return None
    # Neither the fragment nor the query string is part of the file name.
    # Strip them before unquoting so an escaped %23 / %3F stays in the name.
    # TODO: check that the fragment is valid.
    relative = target.split('#', 1)[0].split('?', 1)[0]
    return os.path.abspath(
        os.path.join(os.path.dirname(filename), unquote(relative)))


def analyze(filename):
    """Print every broken relative link found in the HTML file *filename*.

    A link is broken when the file or folder it resolves to (relative to
    the folder of *filename*) does not exist.  Any error while reading or
    parsing the page is printed instead of raised, so a single bad page
    does not abort the whole run.
    """
    # Imported here rather than at module level so the path helpers above
    # stay importable when the third-party lxml package is not installed.
    import lxml.html

    try:
        # Read bytes and let lxml work out the encoding of the page.
        with open(filename, 'rb') as page:
            document = lxml.html.fromstring(page.read())
        # iterlinks() yields (element, attribute, link, pos) tuples.
        for _element, _attribute, link, _pos in document.iterlinks():
            target_filename = link_target_path(filename, link)
            if target_filename is None:
                continue
            if not os.path.exists(target_filename):
                print("In %s broken link:  %s" % (filename, link))
    except Exception as exc:
        # Something bad happened, report it and carry on with other files.
        print("Error with: %s %s" % (filename, exc))


def generated_html_files(task_lines):
    """Yield the HTML file named by each relevant line of *task_lines*.

    Each line is expected to look like ``<task>:<path>``.  A line is kept
    when the part before the first colon is one of ``GENERATED_TASKS`` and
    the line mentions ``.html``; the part after the last colon is yielded.
    """
    for line in task_lines:
        task = line.strip()
        if task.split(':')[0] in GENERATED_TASKS and '.html' in task:
            # It looks like a generated HTML file.
            yield task.split(':')[-1]


def main():
    """Check every generated HTML file listed by ``doit list --all``."""
    # This is hackish: we use doit to get a list of all generated files.
    # Minor modifications would let you check the non-generated files too.
    pipe = os.popen('doit list --all', 'r')
    try:
        task_lines = pipe.readlines()
    finally:
        pipe.close()
    for filename in generated_html_files(task_lines):
        analyze(filename)


if __name__ == '__main__':
    main()