#!/usr/bin/env python
#
# bheap.py - class to manage bitheap database
#
# Version 1.2
#
# Written by Jim Wiggins, with help from Jon Gross and John Labovitz
#
# TODO: --info command
#       reconstruct portions of db
#       list of hashes to ignore
#       list of hash collisions to ignore
#       GUI

import getpass
import os
import sys
import traceback

import pg

try:
    # hashlib supersedes the deprecated md5 module (available from Python 2.5)
    import hashlib
    _new_md5 = hashlib.md5
except ImportError:
    # Older interpreters only ship the legacy md5 module
    import md5
    _new_md5 = md5.new


def ESQ(s):
    """Return s with single quotes doubled, for use in an SQL string literal."""
    return s.replace("'", "''")


class bheap(object):
    VERSION = 1.2
    BLKSZ = 8192    # number of bytes read per chunk while hashing a file

    # Instance attributes:
    #   conn         database connection, or None once closed
    #   schema_flag  True when both bitheap tables exist
    #   root_dir     root directory stored in bheap_setup, or None

    ##############################################
    # Creation and destruction of a class object #
    ##############################################

    def __init__(self, user=None, dbname=None):
        """Connect to the database and test for existence of the schema.

        user defaults to the current login name and dbname defaults to
        user.  Whatever pg.connect() raises is propagated to the caller.
        """
        # Define every attribute before connecting, so that close() and
        # __del__ stay safe when pg.connect() raises.
        self.conn = None
        self.schema_flag = False
        self.root_dir = None

        # default to current user and db named after them
        if not user:
            user = getpass.getuser()
        if not dbname:
            dbname = user

        # connect to database
        host = None
        port = -1
        opt = None
        tty = None
        pwd = None
        self.conn = pg.connect(dbname, host, port, opt, tty, user, pwd)

        self.schema_flag = self.schema_exists()
        if self.schema_flag:
            self.root_dir = self.root()

    def close(self):
        """Close the connection; best called explicitly, since __del__
        might not happen.  Calling it more than once is harmless."""
        conn = getattr(self, 'conn', None)
        if conn:
            conn.close()
        self.conn = None

    def __del__(self):
        """Try to clean up after sloppy user."""
        self.close()

    ##########################################
    # Creation and destruction of the schema #
    ##########################################

    def schema_create(self, dbpath):
        """Create the tables, sequence and index, and record dbpath.

        dbpath is made absolute, stored in bheap_setup together with
        VERSION, and becomes root_dir.
        """
        self.conn.query(
            "CREATE TABLE bheap_setup ( "
            "version VARCHAR, "
            "dbpath VARCHAR)")
        self.conn.query(
            "CREATE TABLE bheap ( "
            "pkey INTEGER PRIMARY KEY, "
            "md5sum CHAR(32), "
            "path VARCHAR)")
        self.conn.query("CREATE SEQUENCE bheap_pkey_seq")
        self.conn.query("CREATE INDEX bheap_md5sum_index ON bheap(md5sum)")
        self.schema_flag = True

        # some initial values; the path is escaped because directory
        # names may legitimately contain a single quote
        dbpath = os.path.abspath(dbpath)
        self.conn.query(
            "INSERT INTO bheap_setup (version, dbpath) "
            "VALUES ('%s', '%s')" % (self.VERSION, ESQ(dbpath)))
        self.root_dir = dbpath

    def schema_die_die_die(self):
        """Drop index, sequence and both tables, ignoring failures."""
        statements = (
            "DROP INDEX bheap_md5sum_index",
            "DROP SEQUENCE bheap_pkey_seq",
            "DROP TABLE bheap",
            "DROP TABLE bheap_setup",
        )
        for statement in statements:
            try:
                self.conn.query(statement)
            except Exception:
                # best effort: a failed drop (for instance because the
                # object is not there) must not stop the remaining drops
                pass
        self.schema_flag = False

    ##################
    # Schema updates #
    ##################

    def insert(self, hash, path):
        """Insert a new (hash, path) entry; does nothing without a schema."""
        if not self.schema_flag:
            return
        q = self.conn.query("SELECT nextval('bheap_pkey_seq')")
        result = q.getresult()
        if result:
            (next_pkey,) = result[0]
            self.conn.query(
                "INSERT INTO bheap (pkey, md5sum, path) "
                "VALUES (%d, '%s', '%s')"
                % (next_pkey, ESQ(hash), ESQ(path)))

    ##################
    # Schema queries #
    ##################

    def schema_exists(self):
        """Return True if both bheap_setup and bheap are listed in pg_tables."""
        for table in ('bheap_setup', 'bheap'):
            q = self.conn.query(
                "SELECT count(*) FROM pg_tables WHERE tablename = '%s'"
                % table)
            (count,) = q.getresult()[0]
            if count == 0:
                return False
        return True

    def version(self):
        """Return the version stored in bheap_setup, or None."""
        version = None
        if self.schema_flag:
            q = self.conn.query("SELECT version FROM bheap_setup")
            result = q.getresult()
            if result:
                (version,) = result[0]
        return version

    def root(self):
        """Return the directory root stored in bheap_setup, or None.

        An empty stored path is reported as None as well.
        """
        root = None
        if self.schema_flag:
            q = self.conn.query("SELECT dbpath FROM bheap_setup")
            result = q.getresult()
            if result:
                (root,) = result[0]
                if root == '':
                    root = None
        return root

    def size(self):
        """Return count of rows in bheap (0 without a schema)."""
        if not self.schema_flag:
            return 0
        q = self.conn.query("SELECT count(*) FROM bheap")
        (count,) = q.getresult()[0]
        return count

    def matches(self, hash):
        """Return list of stored paths whose md5sum equals hash."""
        if not self.schema_flag:
            return []
        # select the one needed column by name instead of relying on the
        # position of "path" in SELECT *
        q = self.conn.query(
            "SELECT path FROM bheap WHERE md5sum = '%s'" % ESQ(hash))
        return [row[0] for row in q.getresult()]

    def known(self, path):
        """Say if given path is known (True) or novel (False).

        A file that cannot be hashed counts as not known.
        """
        path = os.path.abspath(path)
        (digest, nbytes) = self.md5sum(path)
        if not digest:
            return False
        return len(self.matches(digest)) > 0

    def dump(self):
        """Dump version, root, entry count and every (md5sum, path) row."""
        if not self.schema_flag:
            return
        self.report('Version %s' % self.version())
        self.report('Root = %s' % self.root())
        self.report('%d entries' % self.size())
        q = self.conn.query(
            "SELECT md5sum, path FROM bheap "
            "ORDER BY md5sum, path")
        for row in q.getresult():
            self.report('%s %s' % (row[0], row[1]))

    #######################
    # More involved stuff #
    #######################

    def _ready(self):
        """Return True when schema and root exist; else print an error."""
        if self.schema_flag and self.root_dir:
            return True
        self.error('bitheap not initialized')
        return False

    def _report_status(self, path):
        """Print path, prefixed with NEW when it is not known."""
        if self.known(path):
            self.report(' %s' % path)
        else:
            self.report('NEW %s' % path)

    def import_dir(self, impath):
        """Import a directory (that exists under root) recursively.

        Every regular file below impath is hashed and inserted with a
        path relative to root ("./sub/file").  Symlinked files are
        skipped with a warning, files that cannot be hashed are skipped.
        """
        # make sure things exist
        if not self._ready():
            return

        # make sure new dir lies under root; comparing against the root
        # plus a separator keeps sibling directories such as "<root>foo"
        # from passing as children of "<root>"
        impath = os.path.abspath(impath)
        prefix = self.root_dir.rstrip(os.sep) + os.sep
        if impath != self.root_dir and not impath.startswith(prefix):
            self.error('%s does not lie under %s' % (impath, self.root_dir))
            return

        self.report('Importing %s...' % impath)
        count = 0
        link_found = False
        for dirpath, dirnames, filenames in os.walk(impath):
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                if os.path.islink(filepath):
                    link_found = True
                    continue
                (digest, nbytes) = self.md5sum(filepath)
                if digest is None:
                    continue
                # filepath always starts with prefix here, so slicing is
                # enough to express it relative to root
                relpath = '.' + os.sep + filepath[len(prefix):]
                self.insert(digest, relpath)
                count = count + 1
                # progress: a dot every 100 files, the count every 1000
                if (count % 100) == 0:
                    self.report('.', False)
                if (count % 1000) == 0:
                    self.report('%d' % count)
        if link_found:
            self.report('WARNING: symlink(s) found')
        self.report('%d files imported into bitheap' % count)

    def dupes(self):
        """Report duplicates within the bitheap.

        Paths sharing an md5sum are printed together, each group
        preceded by an empty line.
        """
        # make sure things exist
        if not self._ready():
            return

        q = self.conn.query(
            "SELECT md5sum, path FROM bheap "
            "WHERE md5sum IN ("
            " SELECT md5sum FROM bheap "
            " GROUP BY md5sum HAVING COUNT(md5sum) > 1) "
            "ORDER BY md5sum")
        current = ''
        for row in q.getresult():
            if row[0] != current:
                self.report('')
                current = row[0]
            self.report('%s' % row[1])

    def analyze(self, path):
        """Compare a file, or the files below a directory, to the bitheap.

        Each file is printed, prefixed with NEW when it is not known.
        A path that is neither file nor directory prints nothing.
        """
        # make sure things exist
        if not self._ready():
            return

        if os.path.isfile(path):
            self._report_status(path)
        elif os.path.isdir(path):
            for dirpath, dirnames, filenames in os.walk(path):
                filenames.sort()
                for name in filenames:
                    self._report_status(os.path.join(dirpath, name))

    def novel(self, path, sense):
        """Print the novel (or known) files at or below path.

        sense true prints files that are not in the bitheap, sense false
        prints the files that are.  Below a directory, files that cannot
        be hashed are skipped.
        """
        # make sure things exist
        if not self._ready():
            return

        if os.path.isfile(path):
            if (not self.known(path)) == sense:
                self.report('%s' % path)
            return

        for dirpath, dirnames, filenames in os.walk(path):
            filenames.sort()
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                (digest, nbytes) = self.md5sum(filepath)
                if digest is None:
                    continue
                if (len(self.matches(digest)) == 0) == sense:
                    self.report('%s' % filepath)

    def search(self, path):
        """Print the deepest subdirectories containing novelty.

        The tree is walked bottom-up.  A directory is printed when one of
        its own files is novel and none of its subdirectories is; a
        directory with a novel subdirectory is marked but not printed.
        """
        # make sure things exist
        if not self._ready():
            return

        if os.path.isfile(path):
            if not self.known(path):
                self.report('%s' % path)
            return

        novel = {}
        for dirpath, dirnames, filenames in os.walk(path, topdown=False):
            # start by assuming there's nothing new here
            novel[dirpath] = False

            # check subdirs for novelty
            for name in dirnames:
                subpath = os.path.join(dirpath, name)
                if os.path.islink(subpath):
                    continue
                # a subdirectory that os.walk could not list was never
                # yielded and has no entry: treat it as not novel
                if novel.get(subpath, False):
                    novel[dirpath] = True
                    break
            if novel[dirpath]:
                continue

            # check files for novelty
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                (digest, nbytes) = self.md5sum(filepath)
                if digest is not None and len(self.matches(digest)) == 0:
                    novel[dirpath] = True
                    break
            if novel[dirpath]:
                self.report('%s' % dirpath)

    #############
    # Utilities #
    #############

    def report(self, str, newline=True):
        """Output a notice; without newline the text is written and
        flushed immediately."""
        if newline:
            print(str)
        else:
            sys.stdout.write(str)
            sys.stdout.flush()

    def error(self, str):
        """Output an error, prefixed with "ERROR: "."""
        print('ERROR: %s' % str)

    def md5sum(self, filename):
        """Compute the MD5 checksum for a file.

        Return the checksum and total number of bytes read, (None, 0) if
        the file cannot be opened or read; the error is printed.
        """
        try:
            f = open(filename, "rb")
        except (IOError, OSError):
            self.error('Cannot open "%s"' % filename)
            return None, 0

        digest = _new_md5()
        total = 0
        try:
            # nested try blocks keep the handle closed on every path
            try:
                while 1:
                    data = f.read(self.BLKSZ)
                    if not data:
                        break
                    digest.update(data)
                    total += len(data)
            except (IOError, OSError):
                self.error('Cannot read "%s"' % filename)
                return None, 0
        finally:
            f.close()
        return digest.hexdigest(), total


def usage():
    """Print the command line summary."""
    progname = os.path.basename(sys.argv[0])
    print('Usage:')
    templates = (
        ' %s | test current dir against db',
        ' %s | test files, dirs against db',
        ' %s --help | get help',
        ' %s --dump | dump entire db',
        ' %s --dupes | shows duplicates in db',
        ' %s --destroy | destroys the db',
        ' %s --novel | displays only novel files',
        ' %s --known | displays only known files',
        ' %s --search | displays boundary of novel/known',
        ' %s --import | adds new data to db',
        ' %s --rebuild | creates the db',
    )
    for template in templates:
        print(template % progname)


def _run(bh, argv):
    """Carry out the command given in argv on the open bitheap bh."""
    # special case for --rebuild
    if len(argv) == 3 and argv[1] == '--rebuild':
        path = argv[2]
        if not os.path.isdir(os.path.abspath(path)):
            print('Whatcha trying to do, buddy?')
            usage()
        else:
            print('Rebuilding from %s' % path)
            bh.schema_die_die_die()
            bh.schema_create(path)
            bh.import_dir(path)
        return

    if not bh.schema_exists():
        print('You must first --rebuild')
        usage()
        return

    # no arguments means analyze current directory
    if len(argv) <= 1:
        bh.analyze('.')
        return

    command = argv[1]

    # if first arg is not command, then all args are files/dirs
    if not command.startswith('--'):
        for arg in argv[1:]:
            bh.analyze(arg)
        return

    # first arg is command: parse further
    args = argv[2:]
    if command == '--help':
        print('The first thing to do is use the "--rebuild" command to')
        print('create your database. Then...')
        usage()
    elif command == '--dump':
        bh.dump()
    elif command == '--dupes':
        bh.dupes()
    elif command == '--destroy':
        bh.schema_die_die_die()
    elif command == '--novel':
        for arg in args:
            bh.novel(arg, True)
    elif command == '--known':
        for arg in args:
            bh.novel(arg, False)
    elif command == '--search':
        for arg in args:
            bh.search(arg)
    elif command == '--import':
        for arg in args:
            bh.import_dir(arg)
    else:
        usage()


def main():
    """Connect as the current user and run the command in sys.argv."""
    user = getpass.getuser()
    dbname = user
    try:
        bh = bheap(user, dbname)
    except Exception:
        traceback.print_exc(20, sys.stdout)
        print("Cannot connect to database %s as %s" % (dbname, user))
        print("Pre-requisite steps:")
        print("* Install PostgreSQL and PyGreSQL packages")
        print("* As user 'postgres', to set up for 'username', do:")
        print(" $ createuser ")
        print(" $ createdb -O ")
        return

    # close the connection even when a command raises
    try:
        _run(bh, sys.argv)
    finally:
        bh.close()


if __name__ == '__main__':
    try:
        main()
    except Exception:
        traceback.print_exc(20, sys.stdout)