Xref images - Noah.org

Xref images

From Noah.org

Jump to: navigation, search

Click to download: xref_images.py

#!/usr/bin/env python
 
"""
SYNOPSIS
 
    xref_images.py [-h,--help] [-v,--verbose] [--version] HTDOCS_PATH [IMAGE_PATH]
 
DESCRIPTION
 
    This script finds images (.png, .jpg, and .gif) that are not being used in
    a web site document path. This is useful for finding orphaned images that
    are no longer needed by any document and for finding broken image links.
 
    HTDOCS_PATH argument should point to your htdocs path. For example: /var/www/htdocs.
 
    IMAGE_PATH is optional. It defaults to the same as HTDOCS_PATH if not given.
    For example: /var/www/images
 
    Basic algorithm: generate cross-reference of links in HTML files and
    images in the filesystem.
 
    1. We build a list of file names found in the image path.
    2. We build a list of file names extracted from HTML text.
    3. After this is done the lists are compared to find disjoint names.
    4. We build a list of files that are in the path, but not in the HTML.
    5. We build a list of files that are in HTML, but not in the path.
 
    This is limited because there are many ways the doc and images paths can be
    convoluted by SSI, virtual hosts, htconf aliases, and other headaches. In
    other words, this is a crude xref report that will contain errors.
 
    This was originally written for Rebecca Winter.
 
EXAMPLES
 
    ./xref_images.py /var/www/htdocs
    ./xref_images.py /var/www/htdocs /var/www/images
 
EXIT STATUS
 
    none
 
AUTHOR
 
    Noah Spurrier <noah@noah.org>
 
LICENSE
 
    This script is in the public domain, free from copyrights or restrictions.
 
VERSION
 
    $Id: xref_images.py 139 2007-12-13 15:14:14Z root $
"""
 
import sys, os, traceback, optparse
import time
import re
 
def extract_images (filename):

    """Return the image filenames referenced in the given file.

    The file is read line by line and every substring that looks like a
    *.png, *.jpg, or *.gif name (case-insensitive) is collected. All matches
    on a line are taken, not just the first one. Names are returned in order
    of first appearance with duplicates removed.

    If the file cannot be read the error is printed and an empty list is
    returned, so callers can always iterate over the result.

    This DOES NOT handle remote image URLs (or rather, it handles them
    incorrectly). """

    try:
        with open(filename, 'r') as fin:
            lines = fin.readlines()
    except (IOError, OSError, UnicodeError) as error:
        # Best effort: report the unreadable file and carry on with the rest.
        print(error)
        return []

    # The image name could be in a comment or outside an img tag. I don't care.
    image_pattern = re.compile(r'(?i)([^: >"\']+?)(\.png|\.jpg|\.gif)')

    filenames_source = []
    seen = set() # mirrors filenames_source for fast duplicate checks
    for line in lines:
        for m in image_pattern.finditer(line):
            name = m.group(1) + m.group(2)
            if name not in seen:
                seen.add(name)
                filenames_source.append(name)
    return filenames_source
 
def process_html_file (html_image_list, dirname, names):

    """This extracts image filenames from every file in names that ends in
    .html (case-insensitive) and adds them to html_image_list, skipping names
    that are already in the list. The argument order matches the visitor
    signature used by os.path.walk: (arg, dirname, names). The list is
    modified in place; nothing is returned. """

    for name in names:
        if not name.lower().endswith('.html'):
            continue
        # extract_images takes only the path; "or []" guards against a
        # non-list result for an unreadable file.
        for image in extract_images(os.path.join(dirname, name)) or []:
            if image not in html_image_list:
                html_image_list.append(image)
 
def process_html_path (start_path):

    """This walks start_path recursively and returns a list of the full paths
    of every file whose name ends in .html, .htm, or .css (case-insensitive).

    The extension must be at the very end of the name, so backup copies such
    as index.html.bak are not treated as documents. A name that consists of
    the extension alone (for example ".html") is not matched. """

    # \Z anchors the extension to the end of the file name.
    document_pattern = re.compile(r'(?i).+\.(?:html|htm|css)\Z')

    filename_list = []
    for dirpath, dirnames, filenames in os.walk(start_path):
        for name in filenames:
            if document_pattern.match(name):
                filename_list.append(os.path.join(dirpath, name))
    return filename_list
 
def extract_image_filenames_from_directories (image_path):

    """This walks image_path recursively and returns a list of the full paths
    of every file whose name ends in .png, .jpg, or .gif (case-insensitive).

    The extension must be at the very end of the name, so files such as
    logo.png.bak are not reported as images. A name that consists of the
    extension alone (for example ".png") is not matched. """

    # \Z anchors the extension to the end of the file name.
    image_pattern = re.compile(r'(?i).+\.(?:png|jpg|gif)\Z')

    filename_list = []
    for dirpath, dirnames, filenames in os.walk(image_path):
        for name in filenames:
            if image_pattern.match(name):
                filename_list.append(os.path.join(dirpath, name))
    return filename_list
 
def normalize_filename (docroot, filename):

    """This resolves an image name found in a document to a filesystem path.

    A filename that starts with '/' is treated as absolute and returned
    unchanged. Anything else is joined onto docroot and normalized with
    os.path.normpath (so '..' components are collapsed).

    Returns None if filename is empty or if the arguments are not usable as
    path strings. """

    if not filename:
        return None
    try:
        if filename[0] == '/': # absolute
            return filename
        return os.path.normpath(os.path.join(docroot, filename))
    except (TypeError, AttributeError):
        # Non-string input; report "no path" instead of raising.
        return None
 
def in_A_not_in_B (list_a, list_b):

    """Return the items of list_a that do not appear in list_b.

    The order of list_a is preserved, and an item that occurs several times
    in list_a is returned once per occurrence. """

    return [entry for entry in list_a if entry not in list_b]
 
def extract_filenames_referenced_in_html_list (html_filename_list):

    """This returns a list of the image paths referenced by the given
    document files.

    Each image name found by extract_images is resolved with
    normalize_filename against the directory of the document that mentions
    it. Paths are returned in order of first appearance with duplicates
    removed. Names that cannot be resolved (normalize_filename returns None)
    are skipped, as are documents for which no image list is returned. """

    image_filename_referenced_list = []
    seen = set() # mirrors the result list for fast duplicate checks
    for hfn in html_filename_list:
        # Relative image names are relative to the document's own directory.
        cwd = os.path.dirname(hfn)
        # "or []" keeps going if an unreadable document yields a non-list.
        for ifn in extract_images(hfn) or []:
            filename = normalize_filename (cwd, ifn)
            if filename is None or filename in seen:
                continue
            seen.add(filename)
            image_filename_referenced_list.append (filename)
    return image_filename_referenced_list
 
def main ():

    """This builds the cross-reference report and prints it.

    It reads the positional arguments from the module-level args list set by
    the option parser: args[0] is the document path and the optional args[1]
    is the image path (it defaults to the document path). Two lists are
    printed: images referenced by documents but missing from the file system,
    and images in the file system that no document references.

    This returns normally so that the caller decides the exit status. """

    html_path = args[0]
    if len(args) > 1:
        image_path = args[1]
    else:
        image_path = html_path

    html_filename_list = process_html_path (html_path)
    # This is the list of image filenames that are being referenced in all HTML documents found.
    html_image_ref_list = extract_filenames_referenced_in_html_list (html_filename_list)
    # This is the list of image filenames that are actually found in the given path.
    filesystem_image_list = extract_image_filenames_from_directories (image_path)

    not_in_fs = in_A_not_in_B (html_image_ref_list, filesystem_image_list)
    not_in_html = in_A_not_in_B (filesystem_image_list, html_image_ref_list)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Files found in HTML, but not found in the file system (these are missing):')
    for name in not_in_fs:
        print('    ' + name)
    print('')
    print('Files found in the file system, but not referenced in HTML (these can be removed):')
    for name in not_in_html:
        print('    ' + name)
 
#    html_image_list = []
#    os.path.walk (HTML_PATH, process_html_file, html_image_list)
#    html_image_list.sort()
#    filesystem_image_list.sort()
#
#    print 'List of image file names found in HTML files:'
#    print html_image_list
#    print
#    print 'List of image file names found in the file system:'
#    print filesystem_image_list
#
#    not_in_html = disjoint_lists (filesystem_image_list, html_image_list)
#    not_in_fs = disjoint_lists (html_image_list, filesystem_image_list)
 
 
if __name__ == '__main__':
    # Script entry point: parse the command line, run main(), and with
    # --verbose print the start time, end time, and elapsed minutes.
    try:
        start_time = time.time()
        parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), usage=globals()['__doc__'], version='$Id: xref_images.py 139 2007-12-13 15:14:14Z root $')
        parser.add_option ('-v', '--verbose', action='store_true', default=False, help='verbose output')
        (options, args) = parser.parse_args()
        if len(args) < 1:
            parser.error ('missing arguments')
        if len(args) > 2:
            parser.error ('too many arguments')
        if options.verbose:
            print(time.asctime())
        main()
        if options.verbose:
            print(time.asctime())
            print('TOTAL TIME IN MINUTES: %s' % ((time.time() - start_time) / 60.0))
        sys.exit(0)
    except (KeyboardInterrupt, SystemExit): # Ctrl-C or sys.exit()
        # Bare raise keeps the original traceback and exit status.
        raise
    except Exception as e:
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        # sys.exit flushes buffered output; os._exit would discard it.
        sys.exit(1)
-->