[pacman-dev] [RFC] Iterator interface for reading databases

Dan McGee dpmcgee at gmail.com
Sat Jul 16 13:48:14 EDT 2011


On Sat, Jul 16, 2011 at 11:32 AM, Rémy Oudompheng
<remyoudompheng at gmail.com> wrote:
> Hello,
>
> I am thinking an iterator-like interface for database reading could be
> useful. The most obvious example use is the pkgfile functionality.
I don't disagree with the thinking.

> It could be a convenient tool to do one-pass work on databases
> without loading them completely in memory, which can be impossible for
> databases with files info (I don't think everyone can load the whole
> [community] database using libalpm standard structures).
Have you analyzed the heap using massif to see what is going on here?
I doubt we break the ~65 MB mark (with one database), which is big,
but not that ridiculous.

I guess the implementation below seems rather hacky to me, as it
breaks at least two invariants we have:
* Local DB entries are never loaded more than once
* The caller is responsible for freeing the package (I think?)

Wouldn't a much simpler solution be to just have a corollary method to
alpm_pkg_get_files() that released the memory used by the files data,
and dropped the infrq level back down? At least for local packages.
With that said, this gets immensely tricker when it comes to sync
databases as we don't lazy load anything there.

So I did some profiling with my sync db loading files branch using
massif, results attached. This includes loading 6 repos (testing,
core, extra, multilib, community-testing, community). Use ms_print on
the file to see what is going on here and get a pretty graph, but the
short summary:
* Heap usage peaks at ~263 MB
* A ton of that is filelists (at least 66% if not more, see sample 48
breakdown, the three biggest allocations are all my filelist alloc
block of code)
* Our useful heap/extra heap ratio sucks. We could allocate memory a
whole lot smarter apparently. (I have another WIP filelists as arrays
patch which would probably help a lot, as you don't need so many damn
pointers for the linked list layout).

> Here's a patch that roughly shows what I am thinking about: only the code
> for the "local" backend is shown.
>
> diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
> index 6e1e4bc..986e9a1 100644
> --- a/lib/libalpm/alpm.h
> +++ b/lib/libalpm/alpm.h
> @@ -119,6 +119,7 @@ typedef enum _alpm_sigstatus_t {
>
>  typedef struct __alpm_handle_t alpm_handle_t;
>  typedef struct __alpm_db_t alpm_db_t;
> +typedef struct __alpm_dbreader_t alpm_dbreader_t;
>  typedef struct __alpm_pkg_t alpm_pkg_t;
>  typedef struct __alpm_trans_t alpm_trans_t;
>
> @@ -371,6 +372,9 @@ int alpm_option_set_default_siglevel(alpm_handle_t *handle, alpm_siglevel_t leve
>  * @{
>  */
>
> +alpm_dbreader_t *alpm_open_dbdir(alpm_handle_t *handle, const char *dbpath);
> +alpm_dbreader_t *alpm_open_dbarchive(alpm_handle_t *handle, const char *dbpath);
> +
>  /** Get the database of locally installed packages.
>  * The returned pointer points to an internal structure
>  * of libalpm which should only be manipulated through
> diff --git a/lib/libalpm/be_local.c b/lib/libalpm/be_local.c
> index 70f242d..74d6886 100644
> --- a/lib/libalpm/be_local.c
> +++ b/lib/libalpm/be_local.c
> @@ -41,15 +41,22 @@
>  #include "package.h"
>  #include "deps.h"
>
> -static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq);
> +static int local_db_read(const char *path, alpm_pkg_t *info, alpm_dbinfrq_t inforeq);
>
>  #define LAZY_LOAD(info, errret) \
>        do { \
>                if(!(pkg->infolevel & info)) { \
> -                       local_db_read(pkg, info); \
> +                       local_db_read(NULL, pkg, info); \
>                } \
>        } while(0)
>
> +struct local_dbreader_t {
> +       alpm_handle_t *handle;
> +       alpm_pkg_t *(*getitem) (alpm_dbreader_t*);
> +       DIR* dbdir;
> +       const char *dbpath;
> +       char finished;
> +};
>
>  /* Cache-specific accessor functions. These implementations allow for lazy
>  * loading by the files backend when a data member is actually needed
> @@ -235,7 +242,7 @@ static int _cache_changelog_close(const alpm_pkg_t UNUSED *pkg, void *fp)
>
>  static int _cache_force_load(alpm_pkg_t *pkg)
>  {
> -       return local_db_read(pkg, INFRQ_ALL);
> +       return local_db_read(NULL, pkg, INFRQ_ALL);
>  }
>
>
> @@ -376,9 +383,10 @@ static int local_db_populate(alpm_db_t *db)
>        size_t est_count;
>        int count = 0;
>        struct stat buf;
> -       struct dirent *ent = NULL;
>        const char *dbpath;
>        DIR *dbdir;
> +       alpm_dbreader_t *reader;
> +       alpm_pkg_t *pkg;
>
>        dbpath = _alpm_db_path(db);
>        if(dbpath == NULL) {
> @@ -408,7 +416,6 @@ static int local_db_populate(alpm_db_t *db)
>                while(readdir(dbdir) != NULL) {
>                        est_count++;
>                }
> -               rewinddir(dbdir);
>        }
>        if(est_count >= 2) {
>                /* subtract the two extra pointers to get # of children */
> @@ -418,35 +425,13 @@ static int local_db_populate(alpm_db_t *db)
>        /* initialize hash at 50% full */
>        db->pkgcache = _alpm_pkghash_create(est_count * 2);
>        if(db->pkgcache == NULL){
> -               closedir(dbdir);
>                RET_ERR(db->handle, ALPM_ERR_MEMORY, -1);
>        }
> +       closedir(dbdir);
>
> -       while((ent = readdir(dbdir)) != NULL) {
> -               const char *name = ent->d_name;
> -
> -               alpm_pkg_t *pkg;
> -
> -               if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
> -                       continue;
> -               }
> -               if(!is_dir(dbpath, ent)) {
> -                       continue;
> -               }
> -
> -               pkg = _alpm_pkg_new();
> -               if(pkg == NULL) {
> -                       closedir(dbdir);
> -                       RET_ERR(db->handle, ALPM_ERR_MEMORY, -1);
> -               }
> -               /* split the db entry name */
> -               if(_alpm_splitname(name, &(pkg->name), &(pkg->version),
> -                                       &(pkg->name_hash)) != 0) {
> -                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("invalid name for database entry '%s'\n"),
> -                                       name);
> -                       _alpm_pkg_free(pkg);
> -                       continue;
> -               }
> +       reader = alpm_open_dbdir(db->handle, dbpath);
> +       while((pkg = reader->getitem(reader))) {
> +               pkg->origin_data.db = db;
>
>                /* duplicated database entries are not allowed */
>                if(_alpm_pkghash_find(db->pkgcache, pkg->name)) {
> @@ -455,18 +440,6 @@ static int local_db_populate(alpm_db_t *db)
>                        continue;
>                }
>
> -               pkg->origin = PKG_FROM_LOCALDB;
> -               pkg->origin_data.db = db;
> -               pkg->ops = &local_pkg_ops;
> -               pkg->handle = db->handle;
> -
> -               /* explicitly read with only 'BASE' data, accessors will handle the rest */
> -               if(local_db_read(pkg, INFRQ_BASE) == -1) {
> -                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("corrupted database entry '%s'\n"), name);
> -                       _alpm_pkg_free(pkg);
> -                       continue;
> -               }
> -
>                /* add to the collection */
>                _alpm_log(db->handle, ALPM_LOG_FUNCTION, "adding '%s' to package cache for db '%s'\n",
>                                pkg->name, db->treename);
> @@ -474,7 +447,6 @@ static int local_db_populate(alpm_db_t *db)
>                count++;
>        }
>
> -       closedir(dbdir);
>        if(count > 0) {
>                db->pkgcache->list = alpm_list_msort(db->pkgcache->list, (size_t)count, _alpm_pkg_cmp);
>        }
> @@ -516,13 +488,13 @@ static char *get_pkgpath(alpm_db_t *db, alpm_pkg_t *info)
>        f = alpm_list_add(f, linedup); \
>  } while(1) /* note the while(1) and not (0) */
>
> -static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
> +static int local_db_read(const char* pkgpath, alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>  {
>        FILE *fp = NULL;
>        char path[PATH_MAX];
>        char line[1024];
> -       char *pkgpath = NULL;
> -       alpm_db_t *db = info->origin_data.db;
> +  alpm_db_t *db = info->origin_data.db;
> +       const char *treename = db ? db->treename : "NULL";
>
>        /* bitmask logic here:
>         * infolevel: 00001111
> @@ -533,18 +505,20 @@ static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>                /* already loaded all of this info, do nothing */
>                return 0;
>        }
> -       _alpm_log(db->handle, ALPM_LOG_FUNCTION, "loading package data for %s : level=0x%x\n",
> +       _alpm_log(info->handle, ALPM_LOG_FUNCTION, "loading package data for %s : level=0x%x\n",
>                        info->name, inforeq);
>
>        /* clear out 'line', to be certain - and to make valgrind happy */
>        memset(line, 0, sizeof(line));
>
> -       pkgpath = get_pkgpath(db, info);
> +       if (pkgpath == NULL) {
> +               pkgpath = get_pkgpath(db, info);
> +       }
>
>        if(access(pkgpath, F_OK)) {
>                /* directory doesn't exist or can't be opened */
> -               _alpm_log(db->handle, ALPM_LOG_DEBUG, "cannot find '%s-%s' in db '%s'\n",
> -                               info->name, info->version, db->treename);
> +               _alpm_log(info->handle, ALPM_LOG_DEBUG, "cannot find '%s-%s' in db '%s'\n",
> +                               info->name, info->version, db ? treename : "NULL");
>                goto error;
>        }
>
> @@ -552,7 +526,7 @@ static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>        if(inforeq & INFRQ_DESC && !(info->infolevel & INFRQ_DESC)) {
>                snprintf(path, PATH_MAX, "%sdesc", pkgpath);
>                if((fp = fopen(path, "r")) == NULL) {
> -                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("could not open file %s: %s\n"), path, strerror(errno));
> +                       _alpm_log(info->handle, ALPM_LOG_ERROR, _("could not open file %s: %s\n"), path, strerror(errno));
>                        goto error;
>                }
>                while(!feof(fp)) {
> @@ -560,14 +534,14 @@ static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>                        if(strcmp(line, "%NAME%") == 0) {
>                                READ_NEXT();
>                                if(strcmp(line, info->name) != 0) {
> -                                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("%s database is inconsistent: name "
> -                                                               "mismatch on package %s\n"), db->treename, info->name);
> +                                       _alpm_log(info->handle, ALPM_LOG_ERROR, _("%s database is inconsistent: name "
> +                                                               "mismatch on package %s\n"), treename, info->name);
>                                }
>                        } else if(strcmp(line, "%VERSION%") == 0) {
>                                READ_NEXT();
>                                if(strcmp(line, info->version) != 0) {
> -                                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("%s database is inconsistent: version "
> -                                                               "mismatch on package %s\n"), db->treename, info->name);
> +                                       _alpm_log(info->handle, ALPM_LOG_ERROR, _("%s database is inconsistent: version "
> +                                                               "mismatch on package %s\n"), treename, info->name);
>                                }
>                        } else if(strcmp(line, "%DESC%") == 0) {
>                                READ_AND_STORE(info->desc);
> @@ -625,7 +599,7 @@ static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>        if(inforeq & INFRQ_FILES && !(info->infolevel & INFRQ_FILES)) {
>                snprintf(path, PATH_MAX, "%sfiles", pkgpath);
>                if((fp = fopen(path, "r")) == NULL) {
> -                       _alpm_log(db->handle, ALPM_LOG_ERROR, _("could not open file %s: %s\n"), path, strerror(errno));
> +                       _alpm_log(info->handle, ALPM_LOG_ERROR, _("could not open file %s: %s\n"), path, strerror(errno));
>                        goto error;
>                }
>                while(fgets(line, sizeof(line), fp)) {
> @@ -664,11 +638,9 @@ static int local_db_read(alpm_pkg_t *info, alpm_dbinfrq_t inforeq)
>        /* internal */
>        info->infolevel |= inforeq;
>
> -       free(pkgpath);
>        return 0;
>
>  error:
> -       free(pkgpath);
>        if(fp) {
>                fclose(fp);
>        }
> @@ -913,4 +885,96 @@ alpm_db_t *_alpm_db_register_local(alpm_handle_t *handle)
>        return db;
>  }
>
> +static alpm_pkg_t *local_dbreader_getitem(struct local_dbreader_t *reader) {
> +       struct dirent *ent = NULL;
> +       reader->handle->pm_errno = 0;
> +       if (reader->finished) {
> +               /* iteration has already finished */
> +               RET_ERR(reader->handle, ALPM_ERR_WRONG_ARGS, NULL);
> +       }
> +       if (reader->dbdir == NULL) {
> +               reader->finished = 1;
> +               return NULL;
> +       }
> +
> +       while(1) {
> +               ent = readdir(reader->dbdir);
> +               if (ent == NULL) {
> +                       closedir(reader->dbdir);
> +                       reader->finished = 1;
> +                       return NULL;
> +               }
> +               const char *name = ent->d_name;
> +               char fullpath[PATH_MAX];
> +               snprintf(fullpath, PATH_MAX, "%s/%s", reader->dbpath, name);
> +
> +               alpm_pkg_t *pkg;
> +
> +               if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
> +                       continue;
> +               }
> +               if(!is_dir(reader->dbpath, ent)) {
> +                       continue;
> +               }
> +
> +               pkg = _alpm_pkg_new();
> +               if(pkg == NULL) {
> +                       closedir(reader->dbdir);
> +                       reader->finished = 1;
> +                       RET_ERR(reader->handle, ALPM_ERR_MEMORY, NULL);
> +               }
> +               pkg->handle = reader->handle;
> +               /* split the db entry name */
> +               if(_alpm_splitname(name, &(pkg->name), &(pkg->version),
> +                                       &(pkg->name_hash)) != 0) {
> +                       _alpm_log(reader->handle, ALPM_LOG_ERROR, _("invalid name for database entry '%s'\n"),
> +                                       name);
> +                       _alpm_pkg_free(pkg);
> +                       continue;
> +               }
> +
> +               pkg->origin = PKG_FROM_LOCALDB;
> +               pkg->origin_data.db = NULL;
> +               pkg->ops = &local_pkg_ops;
> +
> +               /* explicitly read with only 'BASE' data, accessors will handle the rest */
> +               if(local_db_read(fullpath, pkg, INFRQ_BASE) == -1) {
> +                       _alpm_log(reader->handle, ALPM_LOG_ERROR, _("corrupted database entry '%s'\n"), name);
> +                       _alpm_pkg_free(pkg);
> +                       continue;
> +               }
> +               return pkg;
> +       }
> +}
> +
> +alpm_dbreader_t SYMEXPORT *alpm_open_dbdir(alpm_handle_t *handle, const char *dbpath) {
> +       struct local_dbreader_t *reader = malloc(sizeof(struct local_dbreader_t));
> +       DIR *dbdir;
> +       if(handle == NULL) {
> +               return NULL;
> +       }
> +       if(reader == NULL) {
> +               RET_ERR(handle, ALPM_ERR_MEMORY, NULL);
> +       }
> +       if(dbpath == NULL) {
> +               RET_ERR(handle, ALPM_ERR_WRONG_ARGS, NULL);
> +       }
> +
> +       /* start reading */
> +       dbdir = opendir(dbpath);
> +       if(dbdir == NULL) {
> +               if(errno != ENOENT) {
> +                       /* no database existing yet is not an error */
> +                       free(reader);
> +                       return NULL;
> +               }
> +       }
> +       reader->getitem = &local_dbreader_getitem;
> +       reader->handle = handle;
> +       reader->dbdir = dbdir;
> +       reader->dbpath = dbpath;
> +       reader->finished = 0;
> +       return (alpm_dbreader_t*)reader;
> +}
> +
>  /* vim: set ts=2 sw=2 noet: */
> diff --git a/lib/libalpm/db.h b/lib/libalpm/db.h
> index a950130..3ce5889 100644
> --- a/lib/libalpm/db.h
> +++ b/lib/libalpm/db.h
> @@ -56,6 +56,11 @@ struct db_operations {
>        void (*unregister) (alpm_db_t *);
>  };
>
> +struct __alpm_dbreader_t {
> +       alpm_handle_t *handle;
> +       alpm_pkg_t *(*getitem) (alpm_dbreader_t*);
> +};
> +
>  /* Database */
>  struct __alpm_db_t {
>        alpm_handle_t *handle;
>
> --
> Rémy.
>
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: massif.out.24883
Type: application/octet-stream
Size: 38281 bytes
Desc: not available
URL: <http://mailman.archlinux.org/pipermail/pacman-dev/attachments/20110716/53e29f83/attachment-0001.obj>


More information about the pacman-dev mailing list