Xref images
From Noah.org
Click to download: xref_images.py
#!/usr/bin/env python

"""
SYNOPSIS

    xref_images.py [-h,--help] [-v,--verbose] [--version] HTDOCS_PATH [IMAGE_PATH]

DESCRIPTION

    This script finds images (.png, .jpg, and .gif) that are not being used
    in a web site document path. This is useful for finding orphaned images
    that are no longer needed by any document and for finding broken image
    links.

    HTDOCS_PATH argument should point to your htdocs path.
    For example: /var/www/htdocs.
    IMAGE_PATH is optional. It defaults to the same as HTDOCS_PATH if not
    given. For example: /var/www/images

    Basic algorithm: generate cross-reference of links in HTML files and
    images in the filesystem.

        1. We build a list of file names found in the image path.
        2. We build a list of file names extracted from HTML text.
        3. After this is done the lists are compared to find disjoint names.
        4. We build a list of files that are in the path, but not in the HTML.
        5. We build a list of files that are in HTML, but not in the path.

    This is limited because there are many ways the doc and images paths can
    be convoluted by SSI, virtual hosts, htconf aliases, and other headaches.
    In other words, this is a crude xref report that will contain errors.

    This was originally written for Rebecca Winter.

EXAMPLES

    ./xref_images.py /var/www/htdocs
    ./xref_images.py /var/www/htdocs /var/www/images

EXIT STATUS

    0 when the report was generated, 1 on an unexpected error, 2 on a
    command-line usage error.

AUTHOR

    Noah Spurrier <noah@noah.org>

LICENSE

    This script is in the public domain, free from copyrights or restrictions.

VERSION

    $Id: xref_images.py 139 2007-12-13 15:14:14Z root $
"""

import sys
import os
import traceback
import optparse
import time
import re

# Image reference inside document text: a run of characters that are not
# ':', space, '>', or a quote, followed by an image extension.
_IMAGE_REF_RE = re.compile(r'(?i)([^: >"\']+?)(\.png|\.jpg|\.gif)')

# File name filters. NOTE: these are matched from the start of the name but
# are not anchored at the end, so names such as 'index.html.en' or
# 'logo.png.bak' are accepted as well.
_IMAGE_FILE_RE = re.compile(r'(?i)(.+?)(\.png|\.jpg|\.gif)')
_DOCUMENT_FILE_RE = re.compile(r'(?i)(.+?)(\.html|\.htm|\.css)')
_HTML_FILE_RE = re.compile(r'(?i).*?\.html')


def extract_images(filename):
    """Return the image names (*.png, *.jpg, *.gif) referenced in a file.

    Every match on every line is collected; each distinct name is returned
    once, in order of first appearance. The name may sit in a comment or
    outside an img tag; no attempt is made to parse the markup.

    This DOES NOT handle remote image URLs (or rather, it handles them
    incorrectly): the scheme is cut off at the ':' and the rest is returned
    as if it were a path.

    If the file cannot be read the error is printed and an empty list is
    returned, so callers can always iterate over the result.
    """
    try:
        with open(filename, 'r') as fin:
            lines = fin.readlines()
    except (EnvironmentError, UnicodeError) as error:
        print(error)
        return []

    filenames_source = []
    seen = set()
    for line in lines:
        # finditer, not search: a single line may reference several images.
        for m in _IMAGE_REF_RE.finditer(line):
            name = m.group(1) + m.group(2)
            if name not in seen:
                seen.add(name)
                filenames_source.append(name)
    return filenames_source


def process_html_file(html_image_list, dirname, names):
    """Add the images referenced by the .html files in `names` to a list.

    The signature follows the visitor convention of Python 2's os.path.walk:
    `html_image_list` is the caller's accumulator (modified in place),
    `dirname` is the directory being visited and `names` are its entries.
    Names already present in `html_image_list` are not added again.
    """
    for name in names:
        if _HTML_FILE_RE.match(name):
            for image in extract_images(os.path.join(dirname, name)):
                if image not in html_image_list:
                    html_image_list.append(image)


def process_html_path(start_path):
    """Return the paths of all .html, .htm and .css files under start_path."""
    filename_list = []
    for dirpath, dirnames, filenames in os.walk(start_path):
        for name in filenames:
            if _DOCUMENT_FILE_RE.match(name):
                filename_list.append(os.path.join(dirpath, name))
    return filename_list


def extract_image_filenames_from_directories(image_path):
    """Return the normalized paths of all image files under image_path.

    Paths are passed through os.path.normpath so that they compare equal to
    the references produced by normalize_filename even when image_path is
    given as '.', './htdocs', 'htdocs//' and the like.
    """
    filename_list = []
    for dirpath, dirnames, filenames in os.walk(image_path):
        for name in filenames:
            if _IMAGE_FILE_RE.match(name):
                filename_list.append(
                    os.path.normpath(os.path.join(dirpath, name)))
    return filename_list


def normalize_filename(docroot, filename):
    """Resolve an image reference against the directory of its document.

    A reference that starts with '/' is returned unchanged. Anything else is
    joined to `docroot` and normalized. None is returned when the reference
    is empty or the arguments are not path-like.

    NOTE: a reference starting with '/' is kept verbatim, so a site-absolute
    URL such as '/images/x.png' is compared as if it were a filesystem path.
    """
    try:
        if filename[0] == '/':
            return filename
        return os.path.normpath(os.path.join(docroot, filename))
    except (IndexError, TypeError, AttributeError):
        return None


def in_A_not_in_B(list_a, list_b):
    """Return the items of list_a that do not occur in list_b.

    Order and duplicates of list_a are preserved.
    """
    try:
        # Set lookup keeps this linear; the lists can hold thousands of paths.
        lookup = set(list_b)
        return [item for item in list_a if item not in lookup]
    except TypeError:
        # Unhashable items: fall back to plain sequence membership.
        return [item for item in list_a if item not in list_b]


def extract_filenames_referenced_in_html_list(html_filename_list):
    """Return the normalized image paths referenced by the given documents.

    Each reference is resolved relative to the directory of the document it
    was found in. Every path appears once, in order of first appearance.
    """
    image_filename_referenced_list = []
    seen = set()
    for hfn in html_filename_list:
        cwd = os.path.dirname(hfn)
        for ifn in extract_images(hfn):
            filename = normalize_filename(cwd, ifn)
            # A reference that could not be normalized cannot be reported.
            if filename is None or filename in seen:
                continue
            seen.add(filename)
            image_filename_referenced_list.append(filename)
    return image_filename_referenced_list


def main():
    """Print the cross-reference report.

    Reads the module-level `args` set by the command-line parser:
    args[0] is the document path, args[1] (optional) the image path.
    """
    HTML_PATH = args[0]
    if len(args) > 1:
        IMAGE_PATH = args[1]
    else:
        IMAGE_PATH = HTML_PATH

    html_filename_list = process_html_path(HTML_PATH)
    # This is the list of image filenames that are being referenced in all
    # HTML documents found.
    html_image_ref_list = extract_filenames_referenced_in_html_list(
        html_filename_list)
    # This is the list of image filenames that are actually found in the
    # given path.
    filesystem_image_list = extract_image_filenames_from_directories(
        IMAGE_PATH)

    not_in_fs = in_A_not_in_B(html_image_ref_list, filesystem_image_list)
    not_in_html = in_A_not_in_B(filesystem_image_list, html_image_ref_list)

    print('Files found in HTML, but not found in the file system (these are missing):')
    for name in not_in_fs:
        print(' ' + name)
    print('')
    print('Files found in the file system, but not referenced in HTML (these can be removed):')
    for name in not_in_html:
        print(' ' + name)
    # Return normally: the caller prints the timing and sets the exit status.


if __name__ == '__main__':
    try:
        start_time = time.time()
        parser = optparse.OptionParser(
            formatter=optparse.TitledHelpFormatter(),
            usage=globals()['__doc__'],
            version='$Id: xref_images.py 139 2007-12-13 15:14:14Z root $')
        parser.add_option('-v', '--verbose', action='store_true',
                          default=False, help='verbose output')
        (options, args) = parser.parse_args()
        if len(args) < 1:
            parser.error('missing arguments')
        if len(args) > 2:
            parser.error('too many arguments')
        if options.verbose:
            print(time.asctime())
        main()
        if options.verbose:
            print(time.asctime())
            print('TOTAL TIME IN MINUTES: %s'
                  % ((time.time() - start_time) / 60.0))
        sys.exit(0)
    except KeyboardInterrupt:  # Ctrl-C
        raise
    except SystemExit:  # sys.exit()
        raise
    except Exception as e:
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        # os._exit skips interpreter cleanup, so flush the report first.
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(1)