[aur-dev] [PATCH 1/3] make gendummydata script more friendly

Lukas Fleischer archlinux at cryptocrack.de
Wed Apr 6 15:04:30 EDT 2011


On Tue, Apr 05, 2011 at 11:57:46PM -0700, elij wrote:
> - remove need to use mysql for generating the sql
> - just consider categories an integer range, specified to the size
>   of that in the aur-schema.
> - use the logging module instead of writing directly to stderr
>   this makes the code cleaner as it removes the numerous tests for the value
>   of DBUG, yet allows devs to control the level of output verbosity.
> ---
>  support/schema/gendummydata.py |  106 +++++++++------------------------------
>  1 files changed, 25 insertions(+), 81 deletions(-)
> 

I agree with both changes, but please split this patch into two separate
patches, one per change.

> diff --git a/support/schema/gendummydata.py b/support/schema/gendummydata.py
> index 7b1d0cf..8ed9f69 100755
> --- a/support/schema/gendummydata.py
> +++ b/support/schema/gendummydata.py
> @@ -15,9 +15,9 @@ import os
>  import sys
>  import cStringIO
>  import commands
> +import logging
>  
> -
> -DBUG      = 1
> +log_level = logging.DEBUG # logging level. set to logging.INFO to reduce output

I'm not a Python coder, but is there any reason to use lowercase here
when we use uppercase for all the other constants?

>  SEED_FILE = "/usr/share/dict/words"
>  DB_HOST   = os.getenv("DB_HOST", "localhost")
>  DB_NAME   = os.getenv("DB_NAME", "AUR")
> @@ -33,6 +33,7 @@ PKG_FILES = (8, 30)    # min/max number of files in a package
>  PKG_DEPS  = (1, 5)     # min/max depends a package has
>  PKG_SRC   = (1, 3)     # min/max sources a package has
>  PKG_CMNTS = (1, 5)     # min/max number of comments a package has
> +CATEGORIES_COUNT = 17  # the number of categories from aur-schema
>  VOTING    = (0, .30)   # percentage range for package voting
>  RANDOM_PATHS = (       # random path locations for package files
>  	"/usr/bin", "/usr/lib", "/etc", "/etc/rc.d", "/usr/share", "/lib",
> @@ -45,44 +46,25 @@ RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
>  RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
>  FORTUNE_CMD = "/usr/bin/fortune -l"
>  
> +# setup logging
> +logformat = "%(levelname)s: %(message)s"
> +logging.basicConfig(format=logformat, level=log_level)
> +log = logging.getLogger()
>  
>  if len(sys.argv) != 2:
> -	sys.stderr.write("Missing output filename argument");
> +	log.error("Missing output filename argument")
>  	raise SystemExit
>  
>  # make sure the seed file exists
>  #
>  if not os.path.exists(SEED_FILE):
> -	sys.stderr.write("Please install the 'words' Arch package\n");
> -	raise SystemExit
> -
> -# Make sure database access will be available
> -#
> -try:
> -	import MySQLdb
> -except:
> -	sys.stderr.write("Please install the 'mysql-python' Arch package\n");
> -	raise SystemExit
> -
> -# try to connect to database
> -#
> -try:
> -	db = MySQLdb.connect(host = DB_HOST, user = DB_USER,
> -			db = DB_NAME, passwd = DB_PASS)
> -	dbc = db.cursor()
> -except:
> -	sys.stderr.write("Could not connect to database\n");
> +	log.error("Please install the 'words' Arch package")
>  	raise SystemExit

Shouldn't we use "sys.exit(1)" here instead of raising a bare SystemExit
exception? That way we'd also get a proper (non-zero) exit status. This
might be something to include in the debugging/error handling patch.

>  
> -esc = db.escape_string
> -
> -
>  # track what users/package names have been used
>  #
>  seen_users = {}
>  seen_pkgs = {}
> -categories = {}
> -category_keys = []
>  user_keys = []
>  
>  # some functions to generate random data
> @@ -95,14 +77,14 @@ def genVersion():
>  		ver.append("%d" % random.randrange(0,100))
>  	return ".".join(ver) + "-u%d" % random.randrange(1,11)
>  def genCategory():
> -	return categories[category_keys[random.randrange(0,len(category_keys))]]
> +	return random.randrange(0,CATEGORIES_COUNT)
>  def genUID():
>  	return seen_users[user_keys[random.randrange(0,len(user_keys))]]
>  
>  
>  # load the words, and make sure there are enough words for users/pkgs
>  #
> -if DBUG: print "Grabbing words from seed file..."
> +log.debug("Grabbing words from seed file...")
>  fp = open(SEED_FILE, "r")
>  contents = fp.readlines()
>  fp.close()
> @@ -117,7 +99,7 @@ else:
>  
>  # select random usernames
>  #
> -if DBUG: print "Generating random user names..."
> +log.debug("Generating random user names...")
>  user_id = USER_ID
>  while len(seen_users) < MAX_USERS:
>  	user = random.randrange(0, len(contents))
> @@ -130,7 +112,7 @@ user_keys = seen_users.keys()
>  
>  # select random package names
>  #
> -if DBUG: print "Generating random package names..."
> +log.debug("Generating random package names...")
>  num_pkgs = PKG_ID
>  while len(seen_pkgs) < MAX_PKGS:
>  	pkg = random.randrange(0, len(contents))
> @@ -149,22 +131,6 @@ while len(seen_pkgs) < MAX_PKGS:
>  #
>  contents = None
>  
> -# Load package categories from database
> -#
> -if DBUG: print "Loading package categories..."
> -q = "SELECT * FROM PackageCategories"
> -dbc.execute(q)
> -row = dbc.fetchone()
> -while row:
> -	categories[row[1]] = row[0]
> -	row = dbc.fetchone()
> -category_keys = categories.keys()
> -
> -# done with the database
> -#
> -dbc.close()
> -db.close()
> -
>  # developer/tu IDs
>  #
>  developers = []
> @@ -179,8 +145,7 @@ out.write("BEGIN;\n")
>  
>  # Begin by creating the User statements
>  #
> -if DBUG: print "Creating SQL statements for users.",
> -count = 0
> +log.debug("Creating SQL statements for users.")
>  for u in user_keys:
>  	account_type = 1  # default to normal user
>  	if not has_devs or not has_tus:
> @@ -201,22 +166,18 @@ for u in user_keys:
>  			# a normal user account
>  			#
>  			pass
> -	
> +
>  	s = "INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd) VALUES (%d, %d, '%s', '%s@example.com', MD5('%s'));\n" % (seen_users[u], account_type, u, u, u)
>  	out.write(s)
> -	if count % 10 == 0:
> -		if DBUG: print ".",
> -	count += 1
> -if DBUG: print "."
> -if DBUG:
> -	print "Number of developers:", len(developers)
> -	print "Number of trusted users:", len(trustedusers)
> -	print "Number of users:", (MAX_USERS-len(developers)-len(trustedusers))
> -	print "Number of packages:", MAX_PKGS
> +
> +log.debug("Number of developers: %d" % len(developers))
> +log.debug("Number of trusted users: %d" % len(trustedusers))
> +log.debug("Number of users: %d" % (MAX_USERS-len(developers)-len(trustedusers)))
> +log.debug("Number of packages: %d" % MAX_PKGS)
>  
>  # Create the package statements
>  #
> -if DBUG: print "Creating SQL statements for packages.",
> +log.debug("Creating SQL statements for packages.")
>  count = 0
>  for p in seen_pkgs.keys():
>  	NOW = int(time.time())
> @@ -237,26 +198,21 @@ for p in seen_pkgs.keys():
>  			genCategory(), NOW, uuid, muid)
>  
>  	out.write(s)
> -	if count % 100 == 0:
> -		if DBUG: print ".",
>  	count += 1
>  
>  	# create random comments for this package
>  	#
>  	num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
>  	for i in range(0, num_comments):
> -		fortune = esc(commands.getoutput(FORTUNE_CMD).replace("'",""))
> +		fortune = commands.getoutput(FORTUNE_CMD).replace("'","")

Why did you drop escape_string() here?

>  		now = NOW + random.randrange(400, 86400*3)
>  		s = "INSERT INTO PackageComments (PackageID, UsersID, Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n" % (seen_pkgs[p], genUID(), fortune, now)
>  		out.write(s)
>  
> -if DBUG: print "."
> -
>  # Cast votes
>  #
>  track_votes = {}
> -if DBUG: print "Casting votes for packages.",
> -count = 0
> +log.debug("Casting votes for packages.")
>  for u in user_keys:
>  	num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
>  			int(len(seen_pkgs)*VOTING[1]))
> @@ -270,9 +226,6 @@ for u in user_keys:
>  				track_votes[pkg] = 0
>  			track_votes[pkg] += 1
>  			out.write(s)
> -			if count % 100 == 0:
> -				if DBUG: print ".",
> -			count += 1
>  
>  # Update statements for package votes
>  #
> @@ -282,8 +235,7 @@ for p in track_votes.keys():
>  
>  # Create package dependencies and sources
>  #
> -if DBUG: print "."; print "Creating statements for package depends/sources.",
> -count = 0
> +log.debug("Creating statements for package depends/sources.")
>  for p in seen_pkgs.keys():
>  	num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
>  	this_deps = {}
> @@ -307,17 +259,9 @@ for p in seen_pkgs.keys():
>  				seen_pkgs[p], src)
>  		out.write(s)
>  
> -	if count % 100 == 0:
> -		if DBUG: print ".",
> -	count += 1
> -
> -
>  # close output file
>  #
>  out.write("COMMIT;\n")
>  out.write("\n")
>  out.close()
> -
> -if DBUG: print "."
> -if DBUG: print "Done."
> -
> +log.debug("Done.")
> -- 
> 1.7.4.1


More information about the aur-dev mailing list