"""Walk a storage directory and remove files that are safe to delete.

A file is removed from the directory when:
 - a replica exists at the BNL DQ2 LRC, and
 - the time elapsed since its last touch is greater than ELAPSED_LAST_TOUCH.

Please run it
 - where your SE is locally visible (ls'able), under the file owner
   account (usatlas1 currently, I guess);
 - with python 2.3 or higher.
"""
# Open questions:
#  - Should the file's ctime, mtime, atime be taken into account on deletion?
#  - What to do about files without a timestamp?
#  - Should more file properties be printed?
#  - Print dates as mm/dd/yyyy 24:mm (or Null)

import os
import tempfile
import sys
import time
import commands
import re
import urllib
from xml.dom import minidom
from xml.dom.minidom import Document
from xml.dom.minidom import parse, parseString
from os.path import join, getsize, getatime, getmtime, islink

import MySQLdb

#######################################################################################
# if a file's last touch time is ELAPSED_LAST_TOUCH seconds ago, the file is removable
# days:         7       6       5       4       3       2      1
# seconds: 604800  518400  432000  345600  259200  172800  86400
ELAPSED_LAST_TOUCH=432000
SITE_HTTP_ID='http://osg-itb2.dpcc.uta.edu:8000/dq2/'
# Base URL of the BNL DQ2 service queried by Collector.checkRemoteLRC.
BNL_HTTP_ID='http://dms02.usatlas.bnl.gov:8000/dq2/'
LRC_HOST='osg-itb2.dpcc.uta.edu'
# NOTE(review): database credentials are hard-coded in the source.
LRC_USER='dq2user'
LRC_PWD='dqpwd'
STORAGE_LOC='/data41/ATLAS/dq2'
DATEFORMAT='%d-%b-%Y'
NOW=time.time()
#######################################################################################


class Collector:
    """Accumulate LocalFile objects and process them in batches.

    Each batch is checked against the local LRC database, the BNL catalog
    and the site web service; deletable files are removed and one status
    line per file is written to the output file.
    """

    def __init__(self, output=None, batch=50):
        """Open the report file.

        output -- path of the CSV report ('results.csv' when not given).
        batch  -- number of files collected before a batch is processed.
        """
        self.Files = []
        self.BatchSize = batch
        self.OutName = output or 'results.csv'
        self.OutFile = open(self.OutName, 'w')

    def addFile(self, file, cur):
        """Queue the file at path `file`; process the batch once it is full.

        A file that cannot be stat'ed (e.g. it vanished since the directory
        walk, or is a broken symlink) is reported and skipped instead of
        aborting the whole run.
        """
        try:
            lf = LocalFile(file)
        except OSError:
            print("Skipping %s: %s" % (file, sys.exc_info()[1]))
            return
        self.Files.append(lf)
        if len(self.Files) >= self.BatchSize:
            self.runChecks(cur)

    def empty(self):
        """Forget the current batch."""
        self.Files = []

    def full(self, cur):
        """Process whatever is queued, regardless of the batch size."""
        self.runChecks(cur)

    def close(self):
        """Close the report file; safe to call more than once."""
        if not self.OutFile.closed:
            self.OutFile.close()

    def runChecks(self, cur):
        """Run all checks on the queued files, delete, report, then reset."""
        if not self.Files:
            # Nothing queued: do not send malformed (empty) catalog queries.
            return
        for f in self.Files:
            f.checkLRC(cur)
        self.checkRemoteLRC()
        for f in self.Files:
            if f.shouldDelete():
                f.delete(cur)
        self.checkWebService()
        for f in self.Files:
            f.printStatus(self.OutFile)
        self.empty()

    def _queryCatalog(self, baseUrl):
        """Ask the PoolFileCatalog service at baseUrl about the queued LFNs.

        Returns a list of (fileElement, localFiles) pairs, one per <File>
        element in the answer; localFiles holds every queued LocalFile whose
        LFN matches that element (empty if the LFN was not requested).
        An answer without 'POOLFILECATALOG' yields an empty list.
        Network errors propagate to the caller so that nothing is deleted
        on the basis of a catalog that could not be consulted.
        """
        # Several files in different directories may share a basename, so
        # each LFN maps to a list of local files rather than a single one.
        byLfn = {}
        for f in self.Files:
            byLfn.setdefault(f.LFN, []).append(f)
        query = 'lfns=' + '+'.join([urllib.quote(f.LFN) for f in self.Files])
        response = urllib.urlopen(baseUrl + 'lrc/PoolFileCatalog?' + query)
        try:
            ret = response.read()
        finally:
            response.close()
        matches = []
        if ret.find('POOLFILECATALOG') != -1:
            xmldoc = minidom.parseString(ret)
            for thisfile in xmldoc.getElementsByTagName("File"):
                rlfn = str(thisfile.getElementsByTagName("lfn")[0].getAttribute("name"))
                matches.append((thisfile, byLfn.get(rlfn, [])))
        return matches

    def checkRemoteLRC(self):
        """Flag queued files that are registered (and stored) at BNL."""
        for thisfile, localFiles in self._queryCatalog(BNL_HTTP_ID):
            # Only the first <pfn> of each <File> element is inspected.
            rpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            atBNL = rpfn.find('bnl.gov') != -1
            for lf in localFiles:
                lf.InBNLLRC = True
                if atBNL:
                    lf.CopyAtBNL = True

    def checkWebService(self):
        """Mark files still listed by the site web service as not LRC-deleted."""
        for thisfile, localFiles in self._queryCatalog(SITE_HTTP_ID):
            for lf in localFiles:
                lf.LRCDeleted = False


def _dbText(value):
    """Return a database column value as a plain string (None for NULL).

    Values exposing tostring() (array-like objects) are converted with it;
    anything else goes through str().
    """
    if value is None:
        return None
    if hasattr(value, 'tostring'):
        return value.tostring()
    return str(value)


class LocalFile:
    """One file on local storage plus what the catalogs say about it."""

    def __init__(self, path):
        """Stat the file at `path`; raises OSError if it cannot be stat'ed."""
        # Physical file properties
        self.pathname = path
        ostat = os.stat(self.pathname)
        self.CTime = ostat.st_ctime
        self.MTime = ostat.st_mtime
        self.ATime = ostat.st_atime
        self.Size = ostat.st_size
        # Metadata from LRC (BNL's or LRC?'s)
        self.LFN = os.path.basename(self.pathname)
        self.GUID = None
        self.LastModified = None
        self.Archival = None
        # Where does this exist?
        self.InLRC = False
        self.InBNLLRC = False
        self.CopyAtBNL = False
        # Actions
        self.DeleteMe = False
        self.NoDeleteReason = None
        self.PhysDeleted = False
        self.LRCDeleted = True

    def checkLRC(self, cur):
        """Look the LFN up in the local LRC tables and record its metadata."""
        # The LFN comes from a file name on disk, so it is passed as a bound
        # parameter instead of being interpolated into the statement.
        sql = ("select t_lfn.guid, t_meta.lastmodified, t_meta.archival "
               "from t_lfn, t_meta "
               "where t_lfn.lfname=%s and t_lfn.guid=t_meta.guid")
        res = QueryAll(cur, sql, (self.LFN,))
        if len(res) > 0:
            row = res[0]
            self.InLRC = True
            self.GUID = row[0]
            # Accept plain strings/numbers as well as array-like values, so
            # a pinned ('P') flag is not lost when the driver returns str.
            archival = _dbText(row[2])
            if archival is None:
                archival = ''
            self.Archival = archival
            try:
                self.LastModified = int(_dbText(row[1]))
            except (TypeError, ValueError):
                # No usable timestamp: LastModified stays None and
                # shouldDelete() reports "NoTime".
                pass

    def shouldDelete(self):
        """Decide whether the file may be removed; returns self.DeleteMe.

        When the answer is no, NoDeleteReason is set to one of
        "Pinned", "BNL_LRC", "BNL_COPY", "NEW" or "NoTime".
        """
        self.DeleteMe = False
        if self.Archival == 'P':
            self.NoDeleteReason = "Pinned"
        elif not self.InBNLLRC:
            # Not registered at BNL: keep it for 21 days after the physical
            # copy's mtime, then delete.
            if NOW - self.MTime <= (21 * 24 * 3600):
                self.NoDeleteReason = "BNL_LRC"
            else:
                self.DeleteMe = True
        elif not self.CopyAtBNL:
            self.NoDeleteReason = "BNL_COPY"
        elif not self.InLRC:
            # May need to look at this; what if the file is not in the LRC?
            self.DeleteMe = True
        elif not self.LastModified:
            # Without a timestamp, what to do?  For now: keep the file.
            self.NoDeleteReason = "NoTime"
        elif NOW - self.LastModified <= ELAPSED_LAST_TOUCH:
            self.NoDeleteReason = "NEW"
        else:
            # Older than ELAPSED_LAST_TOUCH
            self.DeleteMe = True
        return self.DeleteMe

    def delete(self, cur):
        """Remove the physical file, then its rows in the local LRC tables.

        If the unlink fails the error is printed and the catalog is left
        untouched.  Whether the catalog entry is really gone is verified
        later by Collector.checkWebService.
        """
        try:
            os.unlink(self.pathname)
            self.PhysDeleted = True
        except OSError:
            print("Deletion failed on %s with error: %s" % (self.pathname, sys.exc_info()[1]))
            return
        if self.GUID is None:
            # The file was never found in the LRC; there is nothing to purge.
            return
        for table in ('t_lfn', 't_pfn', 't_meta'):
            QueryUpdate(cur, "delete from %s where guid=%%s" % table, (self.GUID,))

    def printStatus(self, file=None):
        """Write one comma-separated status line to `file` (stdout if None).

        Columns: pathname, LRC, BNLLRC, BNLCopy, guid, lastModified,
        Archival, PhysMtime, DeleteMe, NoDeleteReason, PhysDeleted,
        LRCDeleted.
        """
        status = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s" % (
            self.pathname,
            self.InLRC,
            self.InBNLLRC,
            self.CopyAtBNL,
            self.GUID,
            self.formatTime(self.LastModified),
            self.Archival,
            self.formatTime(self.MTime),
            self.DeleteMe,
            self.NoDeleteReason,
            self.PhysDeleted,
            self.LRCDeleted)
        if file:
            file.write(status + '\n')
        else:
            print(status)

    def problemFile(self):
        """Return True when the file ended up in an inconsistent state.

        That is: it was selected for deletion but the physical or catalog
        deletion did not happen, or it was kept although the local LRC does
        not know it.
        """
        if self.DeleteMe:
            return not (self.PhysDeleted and self.LRCDeleted)
        return not self.InLRC

    def formatTime(self, field):
        """Format an epoch timestamp with DATEFORMAT (epoch 0 if missing)."""
        return time.strftime(DATEFORMAT, time.localtime(field or 0))


def connectDB(dbname, dbhost, dbuser, dbpwd):
    """Connect to MySQL with autocommit on; returns (connection, cursor).

    On failure the error is printed and the process exits with status 1.
    """
    try:
        pdb = MySQLdb.connect(db=dbname, host=dbhost, user=dbuser, passwd=dbpwd)
        pdb.autocommit(1)
        dbcur = pdb.cursor()
        return pdb, dbcur
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        print("dbname connect : %s %s" % (errType, errValue))
        print("Connection Failed. Exit")
        sys.exit(1)


def connClose(cursor):
    """Close a database cursor."""
    cursor.close()


def dbClose(pdb):
    """Close a database connection."""
    pdb.close()


def QueryAll(cursor, query, args=None):
    """Execute `query` and return all rows.

    args -- optional sequence of values bound to the %s placeholders.
    """
    if args is None:
        cursor.execute(query)
    else:
        cursor.execute(query, args)
    return cursor.fetchall()


def QueryUpdate(cursor, query, args=None):
    """Execute a modifying statement and return cursor.execute()'s result.

    args -- optional sequence of values bound to the %s placeholders.
    """
    if args is None:
        return cursor.execute(query)
    return cursor.execute(query, args)


def runit():
    """Check every file below STORAGE_LOC and delete the removable ones."""
    (pdb, dbcur) = connectDB('localreplicas', LRC_HOST, LRC_USER, LRC_PWD)
    try:
        c = Collector()
        try:
            for path, subdirs, files in os.walk(STORAGE_LOC):
                for name in files:
                    c.addFile(join(path, name), dbcur)
            # Flush the last, partially filled batch.
            c.full(dbcur)
        finally:
            c.close()
    finally:
        # Release the cursor before the connection it belongs to.
        connClose(dbcur)
        dbClose(pdb)


def clean_dirs():
    """Remove empty directories below (and including) STORAGE_LOC."""
    # NOTE(review): with topdown=False the listing of a directory is taken
    # before its children are visited, so a directory whose sub-directories
    # were just removed is only removed on a later run -- confirm that this
    # is acceptable.
    for path, subdirs, files in os.walk(STORAGE_LOC, topdown=False):
        if not files and not subdirs:
            try:
                os.rmdir(path)
                print("deleted directory: %s" % path)
            except OSError:
                print("Failed to delete directory: %s" % path)
        elif files:
            print("%d files found in %s" % (len(files), path))


if __name__ == '__main__':
    runit()
    clean_dirs()