Namcap being slow pissed me off enough that I wanted to figure out what could be taking so long. As is usual with software, 1 module was causing about 90% of the slowdown. Compare the before and after with this patch: $ time namcap -m -r depends {4 packages} >/dev/null real 0m18.001s user 0m9.313s sys 0m7.966s $ time ./namcap.py -m -r depends {4 packages }>/dev/null real 0m4.512s user 0m3.770s sys 0m0.740s This patch addresses a few issue noticed when crawling and profiling the code using cProfile. First, we were doing O(n*m) string compares of files in the local database with libraries in the package. We can use a few tricks to cut down the number of checks- namely, once a library has been found we don't need to look for it anymore, and we can also ignore any non-'.so' files. Another part where we were really shooting ourselves in the foot was invoking readelf on *every single file* in the package. Cut that down so we only call readelf on files with the ELF magic bytes, which gives us a dramatic savings as we no longer fork for files in /usr/share, for instance. Bonus material: script checking left out perl, and script dependencies were formerly listed as link-level dependencies as well. Both of these have been fixed. The sad part is there is still room to improve here, but I decided enough was enough for now. :) Signed-off-by: Dan McGee <dan@archlinux.org> --- Namcap/depends.py | 104 ++++++++++++++++++++++++++++++++--------------------- namcap-tags | 2 +- 2 files changed, 64 insertions(+), 42 deletions(-) diff --git a/Namcap/depends.py b/Namcap/depends.py index e851e5c..4a3f733 100644 --- a/Namcap/depends.py +++ b/Namcap/depends.py @@ -19,7 +19,7 @@ import re, os, os.path, pacman, subprocess -supported_scripts = ['python', 'ruby', 'wish', 'expect', +supported_scripts = ['python', 'perl', 'ruby', 'wish', 'expect', 'tk', 'bash', 'sh', 'dash', 'tcsh', 'pdksh' ] pkgcache = {} @@ -50,7 +50,8 @@ def getcovered(current, dependlist, covereddepend): def figurebitsize(line): """ - Given a line of output from readelf (usually Shared library:) return 'i686' or 'x86-64' if the binary is a 32bit or 64bit binary + Given a line of output from readelf (usually Shared library:) return + 'i686' or 'x86-64' if the binary is a 32bit or 64bit binary """ address = line.split()[0] @@ -59,6 +60,22 @@ def figurebitsize(line): else: return 'i686' +def is_elf(path): + """ + Given a file path, ensure it exists and peek at the first few bytes + to determine if it is an ELF file. + """ + if not os.path.isfile(path): + return False + fd = open(path) + bytes = fd.read(4) + fd.close() + # magic elf header, present in binaries and libraries + if bytes == "\x7FELF": + return True + else: + return False + def scanlibs(data, dirname, names): """ Walk over all the files in the package and run "readelf -d" on them. @@ -66,41 +83,39 @@ def scanlibs(data, dirname, names): """ sharedlibs, scripts = data global supported_scripts + shared = re.compile('Shared library: \[(.*)\]') + script = re.compile('#!.*/(.*)') for i in names: - if os.path.isfile(dirname+'/'+i): - var = subprocess.Popen('readelf -d ' + dirname+'/'+i, + path = dirname + '/' + i + if is_elf(path): + var = subprocess.Popen('readelf -d ' + path, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() for j in var[0].split('\n'): - n = re.search('Shared library: \[(.*)\]', j) + n = shared.search(j) # Is this a Shared library: line? if n != None: # Find out its architecture architecture = figurebitsize(j) try: libpath = os.path.abspath(libcache[architecture][n.group(1)])[1:] - sharedlibs.setdefault(libpath, {})[dirname+'/'+i] = 1 + sharedlibs.setdefault(libpath, {})[path] = 1 except KeyError: # Ignore that library if we can't find it # TODO: review it pass - # But we can check to see if it's a script we know about - else: - try: - fd = open(dirname+'/'+i) - except: - continue - firstline = fd.readline() - firstlinere = re.match('#!.*/(.*)', firstline) - if firstlinere != None: - cmdname = firstlinere.group(1).split()[0] - if cmdname in supported_scripts: - scripts.setdefault(cmdname, {})[dirname+'/'+i] = 1 - - fd.close() - return - + # Maybe it is a script file + elif os.path.isfile(path): + fd = open(path) + firstline = fd.readline() + fd.close() + firstlinere = script.match(firstline) + if firstlinere != None: + cmdname = firstlinere.group(1).split()[0] + if cmdname in supported_scripts: + scripts.setdefault(cmdname, {})[path] = 1 + def finddepends(liblist): dependlist = {} foundlist = [] @@ -108,24 +123,28 @@ def finddepends(liblist): somatches = {} actualpath = {} - for j in liblist.keys(): + knownlibs = liblist.keys() + + for j in knownlibs: actualpath[j] = os.path.realpath('/'+j)[1:] # Sometimes packages don't include all so .so, .so.1, .so.1.13, .so.1.13.19 files # They rely on ldconfig to create all the symlinks # So we will strip off the matching part of the files and use this regexp to match the rest so_end = re.compile('(\.\d+)*') + # Whether we should even look at a particular file + is_so = re.compile('\.so') pacmandb = '/var/lib/pacman/local' for i in os.listdir(pacmandb): if os.path.isfile(pacmandb+'/'+i+'/files'): file = open(pacmandb+'/'+i+'/files') for j in file.readlines(): - if j[len(j)-1:]=='\n': - j = j[:len(j)-1] + if not is_so.search(j): + continue - for k in liblist.keys(): - # If the file is an exact match, so it's a match up to a point and everything after that point matches a the regexp + for k in knownlibs: + # File must be an exact match or have the right .so ending numbers # i.e. gpm includes libgpm.so and libgpm.so.1.19.0, but everything links to libgpm.so.1 # We compare find libgpm.so.1.19.0 startswith libgpm.so.1 and .19.0 matches the regexp if j == actualpath[k] or (j.startswith(actualpath[k]) and so_end.match(j[len(actualpath[k]):])): @@ -134,13 +153,14 @@ def finddepends(liblist): dependlist[n.group(1)] = {} for x in liblist[k]: dependlist[n.group(1)][x] = 1 + knownlibs.remove(k) foundlist.append(k) file.close() ret = [] - for i in liblist.keys(): - if i not in foundlist: - ret.append('Library ' + i + ' has no package associated') + # knownlibs only contains what hasn't been found at this point + for i in knownlibs: + ret.append('Library ' + i + ' has no package associated') return dependlist, ret def getprovides(depends, provides): @@ -155,8 +175,9 @@ def filllibcache(): shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + libline = re.compile('\s*(.*) \((.*)\) => (.*)') for j in var[0].split('\n'): - g = re.match('\s*(.*) \((.*)\) => (.*)',j) + g = libline.match(j) if g != None: if g.group(2).startswith('libc6,x86-64'): libcache['x86-64'][g.group(1)] = g.group(3) @@ -190,6 +211,16 @@ class package: for i in tmpret: ret[1].append(i) + # Remove the package name from that list, we can't depend on ourselves. + if dependlist.has_key(pkginfo.name): + del dependlist[pkginfo.name] + + # Print link-level deps + for i, v in dependlist.iteritems(): + if type(v) == dict: + files = [x[len(data)+1:] for x in v.keys()] + ret[2].append(("link-level-dependence %s in %s", (i, str(files)))) + # Do the script handling stuff for i, v in liblist[1].iteritems(): if not dependlist.has_key(i): @@ -199,16 +230,6 @@ class package: files = [x[len(data)+1:] for x in v.keys()] ret[2].append(("script-link-detected %s in %s", (i, str(files)))) - # Remove the package name from that list, we can't depend on ourselves. - if dependlist.has_key(pkginfo.name): - del dependlist[pkginfo.name] - - # Do the info stuff - for i, v in dependlist.iteritems(): - if type(v) == dict: - files = [x[len(data)+1:] for x in v.keys()] - ret[2].append(("link-level-dependence %s on %s", (str(files), i))) - # Check for packages in testing if os.path.isdir('/var/lib/pacman/sync/testing'): for i in dependlist.keys(): @@ -267,4 +288,5 @@ class package: return ret def type(self): return "tarball" + # vim: set ts=4 sw=4 noet: diff --git a/namcap-tags b/namcap-tags index c6f170e..710077c 100644 --- a/namcap-tags +++ b/namcap-tags @@ -35,7 +35,7 @@ incorrect-permissions %s (%s/%s) :: File (%s) has %s/%s permissions. info-dir-file-present %s :: Info directory file (%s) needs removed. insecure-rpath %s :: Insecure RPATH (%s). If present, RPATH should be only /usr/lib. libtool-file-present %s :: File (%s) is a libtool file. -link-level-dependence %s on %s :: File '%s' link-level dependence on %s +link-level-dependence %s in %s :: Link-level dependence (%s) in file %s mime-cache-not-updated :: Mime-file found. Add "update-mime-database usr/share/mime" to the install file missing-contributor :: Missing Contributor tag missing-custom-license-dir usr/share/licenses/%s :: Missing custom license directory (usr/share/licenses/%s) -- 1.6.4.4