[pacman-dev] [PATCHv2 5/5] be_package: Build the file list from MTREE if possible

Andrew Gregory andrew.gregory.8 at gmail.com
Sat Feb 1 09:36:27 EST 2014


On 01/28/14 at 11:14am, Florian Pritz wrote:
> This greatly speeds up file list generation times by avoiding
> uncompressing the whole package.
> 
> pacman -S base with a deliberate file conflict:
> before: 9.1 seconds
> after:  2.2 seconds
> 
> Signed-off-by: Florian Pritz <bluewind at xinu.at>
> ---
> 
> v2: Adjust for v2 of realloc patch.
> 
>  lib/libalpm/be_package.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 151 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/libalpm/be_package.c b/lib/libalpm/be_package.c
> index 6ef84fc..d876967 100644
> --- a/lib/libalpm/be_package.c
> +++ b/lib/libalpm/be_package.c
> @@ -380,14 +380,34 @@ static int add_entry_to_files_list(alpm_pkg_t *pkg, size_t *files_size, struct a
>  {
>  	const size_t files_count = pkg->files.count;
>  	alpm_file_t *current_file;
> +	mode_t type;
> +	size_t pathlen;
>  	const char *path;
>  
>  	if(!_alpm_realloc((void **)&pkg->files.files, files_size, (files_count + 1) * sizeof(alpm_file_t))) {
>  		return -1;
>  	}
>  
> +	type = archive_entry_filetype(entry);
>  	path = archive_entry_pathname(entry);
>  
> +	pathlen = strlen(path);
> +
> +	/* mtree paths don't contain a tailing slash, those we get from
> +	 * the archive directly do (expensive way)
> +	 * Other code relies on it to detect directories so add it here.*/
> +	if(type == AE_IFDIR && path[pathlen - 1] != '/') {
> +		/* 2 = 1 for / + 1 for \0 */
> +		char *newpath = malloc(pathlen + 2);
> +		if (!newpath) {
> +			_alpm_alloc_fail(pathlen + 2);
> +			return -1;
> +		}
> +		strcpy(newpath, path);
> +		newpath[pathlen] = '/';
> +		newpath[pathlen + 1] = '\0';
> +		path = newpath;

Memory leak.  Just assign to current_file->name directly and avoid the
STRDUP for this case.

> +	}
>  	current_file = pkg->files.files + files_count;
>  	STRDUP(current_file->name, path, return -1);
>  	current_file->size = archive_entry_size(entry);
> @@ -397,6 +417,122 @@ static int add_entry_to_files_list(alpm_pkg_t *pkg, size_t *files_size, struct a
>  }
>  
>  /**
> + * Generate a new file list from an mtree file and add it to the package.
> + * An existing file list will be free()d first.
> + *
> + * archive should point to an archive struct which is already at the
> + * position of the mtree's header.
> + *
> + * @param handle
> + * @param pkg package to add the file list to
> + * @param archive archive containing the mtree
> + * @return 0 on success, <0 on error
> + */
> +static int build_filelist_from_mtree(alpm_handle_t *handle, alpm_pkg_t *pkg, struct archive *archive)
> +{
> +	int ret = 0;

We normally use 'ret' for or own return value.

> +	size_t mtree_msize = 0;
> +	size_t mtree_csize = 0;

These either need more descriptive names or a comment explaining what
they are.

> +	size_t files_size = 0; // we clean up the existing array so this is fine

We only use c-style comments

> +	char *mtree_data = NULL;
> +	struct archive *mtree;
> +	struct archive_entry *mtree_entry = NULL;
> +
> +	_alpm_log(handle, ALPM_LOG_DEBUG, "found mtree for package %s, getting file list\n", pkg->filename);

Please try to keep lines closer to 80 columns.

> +
> +	/* throw away any files we might have already found */
> +	free(pkg->files.files);

You need to free the file names as well.

> +	pkg->files.files = NULL;
> +	pkg->files.count = 0;
> +
> +	/* create a new archive to parse the mtree and load it from archive into memory */
> +	// TODO: split this into a function
> +	if((mtree = archive_read_new()) == NULL) {
> +		handle->pm_errno = ALPM_ERR_LIBARCHIVE;
> +		goto error;
> +	}
> +
> +	_alpm_archive_read_support_filter_all(mtree);
> +	archive_read_support_format_mtree(mtree);
> +
> +	// TODO: split this into a function
> +	while(1) {
> +		ssize_t size;
> +
> +		if(!_alpm_realloc((void **)&mtree_data, &mtree_msize, mtree_csize + ALPM_BUFFER_SIZE)) {
> +			goto error;
> +		}
> +
> +		size = archive_read_data(archive, mtree_data + mtree_csize, ALPM_BUFFER_SIZE);
> +
> +		if(size < 0) {
> +			_alpm_log(handle, ALPM_LOG_ERROR, _("error while reading package %s: %s\n"),
> +					pkg->filename, archive_error_string(archive));
> +			handle->pm_errno = ALPM_ERR_LIBARCHIVE;
> +			goto error;
> +		}
> +		if(size == 0) {
> +			break;
> +		}
> +
> +		mtree_csize += size;
> +	}
> +
> +	// TODO: need wrapper function for archive_read_open_memory for old libarchive versions?
> +	if(archive_read_open_memory(mtree, mtree_data, mtree_csize)) {
> +		_alpm_log(handle, ALPM_LOG_ERROR, _("error while reading mtree of package %s: %s\n"),
> +				pkg->filename, archive_error_string(mtree));
> +		handle->pm_errno = ALPM_ERR_LIBARCHIVE;
> +		goto error;
> +	}
> +
> +	while((ret = archive_read_next_header(mtree, &mtree_entry)) == ARCHIVE_OK) {
> +		const char *path = archive_entry_pathname(mtree_entry);
> +		char *p = NULL;
> +
> +		/* strip leading "./" from path entries */
> +		if(path[0] == '.' && path[1] == '/') {
> +			path += 2;
> +		}
> +
> +		if(handle_simple_path(pkg, path)) {
> +			continue;
> +		}
> +
> +		if(*path == '.') {
> +			continue;
> +		}

Please add a comment why we're skipping all files that start with '.'.  Maybe
this should just be moved into handle_simple_path

> +
> +		/* Apparently archive_entry_copy/set_pathname doesn't like to get it's own
> +		 * pointer and produces rather strange results */
> +		STRDUP(p, path, goto error);
> +		archive_entry_copy_pathname(mtree_entry, p);
> +
> +		if(add_entry_to_files_list(pkg, &files_size, mtree_entry) < 0) {

You've already extracted the entry's pathname everywhere you call
add_entry_to_file_list, how about just passing the path as an additional
parameter instead of dup'ing it and copying it back to the mtree_entry?

> +			free(p);
> +			goto error;
> +		}
> +		free(p);
> +	}
> +
> +	if(ret != ARCHIVE_EOF && ret != ARCHIVE_OK) { /* An error occurred */
> +		_alpm_log(handle, ALPM_LOG_ERROR, _("error while reading mtree of package %s: %s\n"),
> +				pkg->filename, archive_error_string(mtree));
> +		handle->pm_errno = ALPM_ERR_LIBARCHIVE;
> +		goto error;
> +	}
> +
> +	free(mtree_data);
> +	_alpm_archive_read_free(mtree);
> +	_alpm_log(handle, ALPM_LOG_DEBUG, "finished mtree reading for %s\n", pkg->filename);
> +	return 0;
> +error:
> +	free(mtree_data);
> +	_alpm_archive_read_free(mtree);
> +	return -1;
> +}
> +
> +/**
>   * Load a package and create the corresponding alpm_pkg_t struct.
>   * @param handle the context handle
>   * @param pkgfile path to the package file
> @@ -406,7 +542,7 @@ static int add_entry_to_files_list(alpm_pkg_t *pkg, size_t *files_size, struct a
>  alpm_pkg_t *_alpm_pkg_load_internal(alpm_handle_t *handle,
>  		const char *pkgfile, int full)
>  {
> -	int ret, fd, config = 0;
> +	int ret, fd, config, hit_mtree = 0;
>  	struct archive *archive;
>  	struct archive_entry *entry;
>  	alpm_pkg_t *newpkg;
> @@ -469,10 +605,23 @@ alpm_pkg_t *_alpm_pkg_load_internal(alpm_handle_t *handle,
>  			continue;
>  		} else if(handle_simple_path(newpkg, entry_name)) {
>  			continue;
> +		} else if(full && strcmp(entry_name, ".MTREE") == 0) {
> +			/* building the file list: cheap way
> +			 * get the filelist from the mtree file rather than scanning
> +			 * the whole archive  */
> +			if(build_filelist_from_mtree(handle, newpkg, archive) < 0) {
> +				goto error;
> +			}
> +			hit_mtree = 1;
> +			continue;
>  		} else if(*entry_name == '.') {
>  			/* for now, ignore all files starting with '.' that haven't
>  			 * already been handled (for future possibilities) */
> -		} else if(full) {
> +		} else if(hit_mtree && config) {
> +			/* we have all we need */
> +			break;

This is kind of a strange place for this test and is similar to an existing
check at the bottom of the loop.  Could you move them to the top of the loop
and consolidate them?

> +		} else if(full && !hit_mtree) {
> +			/* building the file list: expensive way */
>  			if(add_entry_to_files_list(newpkg, &files_size, entry) < 0) {
>  				goto error;
>  			}
> -- 
> 1.8.5.3


More information about the pacman-dev mailing list