On Tue, Apr 05, 2011 at 11:57:46PM -0700, elij wrote:
- remove need to use mysql for generating the sql - just consider categories an integer range, specified to the size of that in the aur-schema. - use the logging module instead of writing directly to stderr this makes the code cleaner as it removes the numerous tests for the value of DBUG, yet allows devs to control the level of output verbosity. --- support/schema/gendummydata.py | 106 +++++++++------------------------------ 1 files changed, 25 insertions(+), 81 deletions(-)
I agree with both changes, but please split this patch into two separate patches: one that removes the MySQL dependency and one that switches to the logging module.
diff --git a/support/schema/gendummydata.py b/support/schema/gendummydata.py index 7b1d0cf..8ed9f69 100755 --- a/support/schema/gendummydata.py +++ b/support/schema/gendummydata.py @@ -15,9 +15,9 @@ import os import sys import cStringIO import commands +import logging
- -DBUG = 1 +log_level = logging.DEBUG # logging level. set to logging.INFO to reduce output
I'm not a Python coder, but is there any reason to use lowercase here when we use uppercase for all the other constants?
SEED_FILE = "/usr/share/dict/words" DB_HOST = os.getenv("DB_HOST", "localhost") DB_NAME = os.getenv("DB_NAME", "AUR") @@ -33,6 +33,7 @@ PKG_FILES = (8, 30) # min/max number of files in a package PKG_DEPS = (1, 5) # min/max depends a package has PKG_SRC = (1, 3) # min/max sources a package has PKG_CMNTS = (1, 5) # min/max number of comments a package has +CATEGORIES_COUNT = 17 # the number of categories from aur-schema VOTING = (0, .30) # percentage range for package voting RANDOM_PATHS = ( # random path locations for package files "/usr/bin", "/usr/lib", "/etc", "/etc/rc.d", "/usr/share", "/lib", @@ -45,44 +46,25 @@ RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://") RANDOM_LOCS = ("pub", "release", "files", "downloads", "src") FORTUNE_CMD = "/usr/bin/fortune -l"
+# setup logging +logformat = "%(levelname)s: %(message)s" +logging.basicConfig(format=logformat, level=log_level) +log = logging.getLogger()
if len(sys.argv) != 2: - sys.stderr.write("Missing output filename argument"); + log.error("Missing output filename argument") raise SystemExit
# make sure the seed file exists # if not os.path.exists(SEED_FILE): - sys.stderr.write("Please install the 'words' Arch package\n"); - raise SystemExit - -# Make sure database access will be available -# -try: - import MySQLdb -except: - sys.stderr.write("Please install the 'mysql-python' Arch package\n"); - raise SystemExit - -# try to connect to database -# -try: - db = MySQLdb.connect(host = DB_HOST, user = DB_USER, - db = DB_NAME, passwd = DB_PASS) - dbc = db.cursor() -except: - sys.stderr.write("Could not connect to database\n"); + log.error("Please install the 'words' Arch package") raise SystemExit
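Shouldn't we use "sys.exit(1)" here instead of raising a bare SystemExit exception? A bare "raise SystemExit" terminates with exit status 0, so the caller cannot tell that the script failed; with sys.exit(1) we would also get a proper non-zero exit status. This might be something to include in the debugging/error handling patch.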
-esc = db.escape_string - - # track what users/package names have been used # seen_users = {} seen_pkgs = {} -categories = {} -category_keys = [] user_keys = []
# some functions to generate random data @@ -95,14 +77,14 @@ def genVersion(): ver.append("%d" % random.randrange(0,100)) return ".".join(ver) + "-u%d" % random.randrange(1,11) def genCategory(): - return categories[category_keys[random.randrange(0,len(category_keys))]] + return random.randrange(0,CATEGORIES_COUNT) def genUID(): return seen_users[user_keys[random.randrange(0,len(user_keys))]]
# load the words, and make sure there are enough words for users/pkgs # -if DBUG: print "Grabbing words from seed file..." +log.debug("Grabbing words from seed file...") fp = open(SEED_FILE, "r") contents = fp.readlines() fp.close() @@ -117,7 +99,7 @@ else:
# select random usernames # -if DBUG: print "Generating random user names..." +log.debug("Generating random user names...") user_id = USER_ID while len(seen_users) < MAX_USERS: user = random.randrange(0, len(contents)) @@ -130,7 +112,7 @@ user_keys = seen_users.keys()
# select random package names # -if DBUG: print "Generating random package names..." +log.debug("Generating random package names...") num_pkgs = PKG_ID while len(seen_pkgs) < MAX_PKGS: pkg = random.randrange(0, len(contents)) @@ -149,22 +131,6 @@ while len(seen_pkgs) < MAX_PKGS: # contents = None
-# Load package categories from database -# -if DBUG: print "Loading package categories..." -q = "SELECT * FROM PackageCategories" -dbc.execute(q) -row = dbc.fetchone() -while row: - categories[row[1]] = row[0] - row = dbc.fetchone() -category_keys = categories.keys() - -# done with the database -# -dbc.close() -db.close() - # developer/tu IDs # developers = [] @@ -179,8 +145,7 @@ out.write("BEGIN;\n")
# Begin by creating the User statements # -if DBUG: print "Creating SQL statements for users.", -count = 0 +log.debug("Creating SQL statements for users.") for u in user_keys: account_type = 1 # default to normal user if not has_devs or not has_tus: @@ -201,22 +166,18 @@ for u in user_keys: # a normal user account # pass - + s = "INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd) VALUES (%d, %d, '%s', '%s@example.com', MD5('%s'));\n" % (seen_users[u], account_type, u, u, u) out.write(s) - if count % 10 == 0: - if DBUG: print ".", - count += 1 -if DBUG: print "." -if DBUG: - print "Number of developers:", len(developers) - print "Number of trusted users:", len(trustedusers) - print "Number of users:", (MAX_USERS-len(developers)-len(trustedusers)) - print "Number of packages:", MAX_PKGS + +log.debug("Number of developers: %d" % len(developers)) +log.debug("Number of trusted users: %d" % len(trustedusers)) +log.debug("Number of users: %d" % (MAX_USERS-len(developers)-len(trustedusers))) +log.debug("Number of packages: %d" % MAX_PKGS)
# Create the package statements # -if DBUG: print "Creating SQL statements for packages.", +log.debug("Creating SQL statements for packages.") count = 0 for p in seen_pkgs.keys(): NOW = int(time.time()) @@ -237,26 +198,21 @@ for p in seen_pkgs.keys(): genCategory(), NOW, uuid, muid)
out.write(s) - if count % 100 == 0: - if DBUG: print ".", count += 1
# create random comments for this package # num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1]) for i in range(0, num_comments): - fortune = esc(commands.getoutput(FORTUNE_CMD).replace("'","")) + fortune = commands.getoutput(FORTUNE_CMD).replace("'","")
Why did you drop escape_string() here?
now = NOW + random.randrange(400, 86400*3) s = "INSERT INTO PackageComments (PackageID, UsersID, Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n" % (seen_pkgs[p], genUID(), fortune, now) out.write(s)
-if DBUG: print "." - # Cast votes # track_votes = {} -if DBUG: print "Casting votes for packages.", -count = 0 +log.debug("Casting votes for packages.") for u in user_keys: num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]), int(len(seen_pkgs)*VOTING[1])) @@ -270,9 +226,6 @@ for u in user_keys: track_votes[pkg] = 0 track_votes[pkg] += 1 out.write(s) - if count % 100 == 0: - if DBUG: print ".", - count += 1
# Update statements for package votes # @@ -282,8 +235,7 @@ for p in track_votes.keys():
# Create package dependencies and sources # -if DBUG: print "."; print "Creating statements for package depends/sources.", -count = 0 +log.debug("Creating statements for package depends/sources.") for p in seen_pkgs.keys(): num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1]) this_deps = {} @@ -307,17 +259,9 @@ for p in seen_pkgs.keys(): seen_pkgs[p], src) out.write(s)
- if count % 100 == 0: - if DBUG: print ".", - count += 1 - - # close output file # out.write("COMMIT;\n") out.write("\n") out.close() - -if DBUG: print "." -if DBUG: print "Done." - +log.debug("Done.") -- 1.7.4.1