[pacman-dev] [PATCH] libalpm: add iterator interface for syncdb files

morganamilo morganamilo at archlinux.org
Fri Jun 4 21:15:55 UTC 2021


This commit adds an iterator interface for reading files from the
syncdbs. Instead of using alpm_pkg_get_files(), you now open the
database with alpm_db_files_open() and then call alpm_db_files_next()
to step through the packages one at a time. If you want to actually
load the files for the current package, you then call
alpm_db_files_load(). When done, release the iterator with
alpm_db_files_close().

This means alpm_pkg_get_files() will always return an empty list for
syncdbs, even on .files databases; however, it still works on the
localdb and on loaded packages.

This approach is faster when dumping the entire file list but slower when
searching for a specific package.

The memory usage of pacman is drastically lower. See below (build/pacman
is the patched build, pacman is the currently installed one).

build/pacman -Fl        0.55s user 0.01s system 99% cpu 0.556 total
build/pacman -Fl pacman 0.46s user 0.01s system 99% cpu 0.472 total
build/pacman -Fx pacman 2.88s user 0.09s system 99% cpu 2.965 total

pacman -Fl              1.60s user 0.13s system 99% cpu 1.731 total
pacman -Fl pacman       0.24s user 0.04s system 99% cpu 0.283 total
pacman -Fx pacman       2.45s user 0.14s system 99% cpu 2.593 total

                         Peak Memory
build/pacman -Fl         43.52MB
build/pacman -Fl pacman  11.292MB

pacman -Fl               677.048MB
pacman -Fl pacman        163.288MB
---
 lib/libalpm/alpm.h     |  45 ++++++++-
 lib/libalpm/be_sync.c  | 182 ++++++++++++++++++++++++++++-------
 lib/libalpm/db.h       |   7 ++
 lib/libalpm/filelist.c |  14 +++
 lib/libalpm/filelist.h |   1 +
 lib/libalpm/package.c  |   1 +
 src/pacman/files.c     | 209 +++++++++++++++++++++++++----------------
 7 files changed, 344 insertions(+), 115 deletions(-)

diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
index c4acc062..595bd6ee 100644
--- a/lib/libalpm/alpm.h
+++ b/lib/libalpm/alpm.h
@@ -101,6 +101,11 @@ typedef struct __alpm_handle_t alpm_handle_t;
 typedef struct __alpm_db_t alpm_db_t;
 
 
+/** A Database file iterator
+ * @ingroup libalpm_databases
+ */
+typedef struct __alpm_db_files_t alpm_db_files_t;
+
 /** A package.
  *
  * A package can be loaded from disk via \link alpm_pkg_load \endlink or retrieved from a database.
@@ -164,6 +169,9 @@ typedef struct _alpm_backup_t {
  */
 alpm_file_t *alpm_filelist_contains(alpm_filelist_t *filelist, const char *path);
 
+/** Frees a file list */
+void  alpm_filelist_free(alpm_filelist_t *files);
+
 /* End of libalpm_files */
 /** @} */
 
@@ -1450,6 +1458,42 @@ int alpm_db_get_usage(alpm_db_t *db, int *usage);
 /* End of usage accessors */
 /** @} */
 
+/** @name File iterators
+ * @{
+ */
+
+/** Opens a handle to the db files iterator.
+ * @param db the db files to iterate over
+ * @return handle to the iterator
+ */
+alpm_db_files_t *alpm_db_files_open(alpm_db_t *db);
+
+/** Goes to the next package.
+ * @param files handle to the file iterator
+ * @param pkgname stores the pkgname of the current package
+ * @return 0 on success, 1 if end of iterator, -1 on error
+ */
+int alpm_db_files_next(alpm_db_files_t *files, char** pkgname);
+
+/** Loads the files for a package into a file list.
+ *
+ * This extends the file list as needed, reusing the memory allocated.
+ * You can reuse the same file list for calls to this function but
+ * the list should be freed with \link alpm_filelist_free alpm_filelist_free \endlink
+ * after use.
+ * @param files handle to the file iterator
+ * @param filelist the filelist to load files into
+ * @return 0 on success, -1 on error
+ */
+int alpm_db_files_load(alpm_db_files_t *files, alpm_filelist_t *filelist);
+
+/** Close the db file iterator
+ * @param files handle to the file iterator
+ */
+void alpm_db_files_close(alpm_db_files_t *files);
+
+/* End of file iterators */
+/** @} */
 
 /* End of libalpm_databases */
 /** @} */
@@ -2684,7 +2728,6 @@ int alpm_pkg_mtree_close(const alpm_pkg_t *pkg, struct archive *archive);
 /* End of mtree accessors */
 /** @} */
 
-
 /* End of libalpm_packages */
 /** @} */
 
diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
index d85f36ee..9e7f8085 100644
--- a/lib/libalpm/be_sync.c
+++ b/lib/libalpm/be_sync.c
@@ -566,8 +566,7 @@ static int sync_db_read(alpm_db_t *db, struct archive *archive,
 		return 0;
 	}
 
-	if(strcmp(filename, "desc") == 0 || strcmp(filename, "depends") == 0
-			|| strcmp(filename, "files") == 0) {
+	if(strcmp(filename, "desc") == 0 || strcmp(filename, "depends") == 0) {
 		int ret;
 		while((ret = _alpm_archive_fgets(archive, &buf)) == ARCHIVE_OK) {
 			char *line = buf.line;
@@ -636,36 +635,6 @@ static int sync_db_read(alpm_db_t *db, struct archive *archive,
 				READ_AND_SPLITDEP(pkg->conflicts);
 			} else if(strcmp(line, "%PROVIDES%") == 0) {
 				READ_AND_SPLITDEP(pkg->provides);
-			} else if(strcmp(line, "%FILES%") == 0) {
-				/* TODO: this could lazy load if there is future demand */
-				size_t files_count = 0, files_size = 0;
-				alpm_file_t *files = NULL;
-
-				while(1) {
-					if(_alpm_archive_fgets(archive, &buf) != ARCHIVE_OK) {
-						goto error;
-					}
-					line = buf.line;
-					if(_alpm_strip_newline(line, buf.real_line_size) == 0) {
-						break;
-					}
-
-					if(!_alpm_greedy_grow((void **)&files, &files_size,
-								(files_count ? (files_count + 1) * sizeof(alpm_file_t) : 8 * sizeof(alpm_file_t)))) {
-						goto error;
-					}
-					STRDUP(files[files_count].name, line, goto error);
-					files_count++;
-				}
-				/* attempt to hand back any memory we don't need */
-				if(files_count > 0) {
-					REALLOC(files, sizeof(alpm_file_t) * files_count, (void)0);
-				} else {
-					FREE(files);
-				}
-				pkg->files.count = files_count;
-				pkg->files.files = files;
-				_alpm_filelist_sort(&pkg->files);
 			}
 		}
 		if(ret != ARCHIVE_EOF) {
@@ -716,3 +685,152 @@ alpm_db_t *_alpm_db_register_sync(alpm_handle_t *handle, const char *treename,
 	handle->dbs_sync = alpm_list_add(handle->dbs_sync, db);
 	return db;
 }
+
+static int load_files(struct archive *archive, alpm_filelist_t *filelist)
+{
+	struct archive_read_buffer buf = {0};
+
+	/* 512K for a line length seems reasonable */
+	buf.max_line_size = 512 * 1024;
+
+	_alpm_filelist_truncate(filelist);
+
+	int ret;
+	while((ret = _alpm_archive_fgets(archive, &buf)) == ARCHIVE_OK) {
+		char *line = buf.line;
+		if(_alpm_strip_newline(line, buf.real_line_size) == 0) {
+			/* length of stripped line was zero */
+			continue;
+		}
+
+		if(strcmp(line, "%FILES%") == 0) {
+			size_t files_size = 0;
+
+			while(1) {
+				if(_alpm_archive_fgets(archive, &buf) != ARCHIVE_OK) {
+					goto error;
+				}
+				line = buf.line;
+				if(_alpm_strip_newline(line, buf.real_line_size) == 0) {
+					break;
+				}
+
+				if(!_alpm_greedy_grow((void **)&filelist->files, &files_size,
+							(filelist->count ? (filelist->count + 1) * sizeof(alpm_file_t) : 8 * sizeof(alpm_file_t)))) {
+					goto error;
+				}
+				STRDUP(filelist->files[filelist->count].name, line, goto error);
+				filelist->count++;
+			}
+			_alpm_filelist_sort(filelist);
+		}
+	}
+	if(ret != ARCHIVE_EOF) {
+		goto error;
+	}
+
+	return 0;
+
+error:
+	return -1;
+}
+
+alpm_db_files_t SYMEXPORT *alpm_db_files_open(alpm_db_t *db)
+{
+	const char *dbpath;
+	int fd;
+	struct stat buf;
+	struct archive *archive;
+	alpm_db_files_t *files = NULL;
+
+	ASSERT(db != NULL, return NULL);
+
+	dbpath = _alpm_db_path(db);
+	if(!dbpath) {
+		/* pm_errno set in _alpm_db_path() */
+		return NULL;
+	}
+
+	if(db->status & DB_STATUS_INVALID || db->status & DB_STATUS_MISSING) {
+		return NULL;
+	}
+
+	fd = _alpm_open_archive(db->handle, dbpath, &buf,
+			&archive, ALPM_ERR_DB_OPEN);
+	if(fd < 0) {
+		db->status &= ~DB_STATUS_VALID;
+		db->status |= DB_STATUS_INVALID;
+		_alpm_archive_read_free(archive);
+		return NULL;
+	}
+
+	MALLOC(files, sizeof(alpm_db_files_t), RET_ERR(db->handle, ALPM_ERR_MEMORY, NULL));
+	files->archive = archive;
+	files->fd = fd;
+	files->db = db;
+	return files;
+}
+
+int SYMEXPORT alpm_db_files_next(alpm_db_files_t *files, char** pkgname)
+{
+	struct archive_entry *entry;
+	const char *entryname;
+	int archive_ret;
+	char *filename;
+
+	ASSERT(files != NULL, return -1);
+	ASSERT(pkgname != NULL, return -1);
+
+	while((archive_ret = archive_read_next_header(files->archive, &entry)) == ARCHIVE_OK) {
+		mode_t mode = archive_entry_mode(entry);
+		if(!S_ISDIR(mode)) {
+			entryname = archive_entry_pathname(entry);
+			if(entryname == NULL) {
+				_alpm_log(files->db->handle, ALPM_LOG_DEBUG,
+						"invalid archive entry provided to alpm_db_files_next, skipping\n");
+				return -1;
+			}
+
+			if(_alpm_splitname(entryname, pkgname, NULL, NULL) != 0) {
+				_alpm_log(files->db->handle, ALPM_LOG_ERROR,
+						_("invalid name for database entry '%s'\n"), entryname);
+				return -1;
+			}
+
+			filename = strrchr(entryname, '/');
+			filename++;
+
+			/* we only want to read the file list */
+			if(filename && strcmp(filename, "files") == 0) {
+				return 0;
+			}
+		}
+	}
+	if(archive_ret != ARCHIVE_EOF) {
+		return -1;
+	}
+	return 1;
+}
+
+int SYMEXPORT alpm_db_files_load(alpm_db_files_t *files, alpm_filelist_t *filelist)
+{
+	ASSERT(files != NULL, return -1);
+	ASSERT(filelist != NULL, return -1);
+
+	_alpm_filelist_truncate(filelist);
+	if(load_files(files->archive, filelist) != 0) {
+		_alpm_log(files->db->handle, ALPM_LOG_ERROR,
+			_("could not parse package description file '%s' from db '%s'\n"),
+			"files", files->db->treename);
+		return -1;
+	}
+	return 0;
+}
+
+void SYMEXPORT alpm_db_files_close(alpm_db_files_t *files)
+{
+	ASSERT(files != NULL, return);
+	_alpm_archive_read_free(files->archive);
+	close(files->fd);
+	free(files);
+}
diff --git a/lib/libalpm/db.h b/lib/libalpm/db.h
index 92c69ba7..1e7b670a 100644
--- a/lib/libalpm/db.h
+++ b/lib/libalpm/db.h
@@ -61,6 +61,13 @@ struct db_operations {
 	void (*unregister) (alpm_db_t *);
 };
 
+/* Database files iterator */
+struct __alpm_db_files_t {
+	struct archive *archive;
+	int fd;
+	alpm_db_t *db;
+};
+
 /* Database */
 struct __alpm_db_t {
 	alpm_handle_t *handle;
diff --git a/lib/libalpm/filelist.c b/lib/libalpm/filelist.c
index 07239c35..8dccd4bb 100644
--- a/lib/libalpm/filelist.c
+++ b/lib/libalpm/filelist.c
@@ -145,3 +145,17 @@ void _alpm_filelist_sort(alpm_filelist_t *filelist)
 		}
 	}
 }
+
+void _alpm_filelist_truncate(alpm_filelist_t *files)
+{
+	for(size_t i = 0; i < files->count; i++) {
+		FREE(files->files[i].name);
+	}
+	files->count = 0;
+}
+
+void SYMEXPORT alpm_filelist_free(alpm_filelist_t *files)
+{
+	_alpm_filelist_truncate(files);
+	free(files->files);
+}
diff --git a/lib/libalpm/filelist.h b/lib/libalpm/filelist.h
index 928e3c1e..4bb70093 100644
--- a/lib/libalpm/filelist.h
+++ b/lib/libalpm/filelist.h
@@ -28,5 +28,6 @@ alpm_list_t *_alpm_filelist_intersection(alpm_filelist_t *filesA,
 		alpm_filelist_t *filesB);
 
 void _alpm_filelist_sort(alpm_filelist_t *filelist);
+void _alpm_filelist_truncate(alpm_filelist_t *filelist);
 
 #endif /* ALPM_FILELIST_H */
diff --git a/lib/libalpm/package.c b/lib/libalpm/package.c
index f837f84a..4f721797 100644
--- a/lib/libalpm/package.c
+++ b/lib/libalpm/package.c
@@ -99,6 +99,7 @@ static alpm_list_t *_pkg_get_replaces(alpm_pkg_t *pkg)   { return pkg->replaces;
 static alpm_filelist_t *_pkg_get_files(alpm_pkg_t *pkg)  { return &(pkg->files); }
 static alpm_list_t *_pkg_get_backup(alpm_pkg_t *pkg)     { return pkg->backup; }
 
+
 static void *_pkg_changelog_open(alpm_pkg_t UNUSED *pkg)
 {
 	return NULL;
diff --git a/src/pacman/files.c b/src/pacman/files.c
index 3801d735..5d39b074 100644
--- a/src/pacman/files.c
+++ b/src/pacman/files.c
@@ -40,9 +40,8 @@ static void print_line_machinereadable(alpm_db_t *db, alpm_pkg_t *pkg, char *fil
 	fputs("\n", stdout);
 }
 
-static void dump_pkg_machinereadable(alpm_db_t *db, alpm_pkg_t *pkg)
+static void dump_pkg_machinereadable(alpm_db_t *db, alpm_pkg_t *pkg, alpm_filelist_t *pkgfiles)
 {
-	alpm_filelist_t *pkgfiles = alpm_pkg_get_files(pkg);
 	for(size_t filenum = 0; filenum < pkgfiles->count; filenum++) {
 		const alpm_file_t *file = pkgfiles->files + filenum;
 		print_line_machinereadable(db, pkg, file->name);
@@ -108,7 +107,9 @@ static void filetarget_free(struct filetarget *ftarg) {
 
 static int files_search(alpm_list_t *syncs, alpm_list_t *targets, int regex) {
 	int ret = 0;
-	alpm_list_t *t, *filetargs = NULL;
+	alpm_list_t *t, *s,  *filetargs = NULL;
+	alpm_filelist_t filelist = {0};
+	char *pkgname = NULL;
 
 	for(t = targets; t; t = alpm_list_next(t)) {
 		char *targ = t->data;
@@ -144,43 +145,58 @@ static int files_search(alpm_list_t *syncs, alpm_list_t *targets, int regex) {
 		goto cleanup;
 	}
 
-	for(t = filetargs; t; t = alpm_list_next(t)) {
-		struct filetarget *ftarg = t->data;
-		char *targ = ftarg->targ;
-		regex_t *reg = &ftarg->reg;
-		int exact_file = ftarg->exact_file;
-		alpm_list_t *s;
-		int found = 0;
-
-		for(s = syncs; s; s = alpm_list_next(s)) {
-			alpm_list_t *p;
-			alpm_db_t *repo = s->data;
-			alpm_list_t *packages = alpm_db_get_pkgcache(repo);
-			int m;
-
-			for(p = packages; p; p = alpm_list_next(p)) {
-				alpm_pkg_t *pkg = p->data;
-				alpm_filelist_t *files = alpm_pkg_get_files(pkg);
+	for(s = syncs; s; s = alpm_list_next(s)) {
+		alpm_db_t *repo = s->data;
+		int m;
+
+		alpm_db_files_t *files = alpm_db_files_open(repo);
+
+		if(!files) {
+			continue;
+		}
+
+		while(1) {
+			int ok = alpm_db_files_next(files, &pkgname);
+			if(ok == 1) {
+				break;
+			}
+			if(ok != 0) {
+				continue;
+			}
+
+			if(alpm_db_files_load(files, &filelist) != 0) {
+				ret = 1;
+				continue;
+			}
+
+			alpm_pkg_t *pkg = alpm_db_get_pkg(repo, pkgname);
+
+			for(t = filetargs; t; t = alpm_list_next(t)) {
+				struct filetarget *ftarg = t->data;
+				char *targ = ftarg->targ;
+				regex_t *reg = &ftarg->reg;
+				int exact_file = ftarg->exact_file;
+				int found = 0;
 				alpm_list_t *match = NULL;
 
 				if(exact_file) {
-					if (regex) {
-						for(size_t f = 0; f < files->count; f++) {
-							char *c = files->files[f].name;
+					if(regex) {
+						for(size_t f = 0; f < filelist.count; f++) {
+							char *c = filelist.files[f].name;
 							if(regexec(reg, c, 0, 0, 0) == 0) {
-								match = alpm_list_add(match, files->files[f].name);
+								match = alpm_list_add(match, filelist.files[f].name);
 								found = 1;
 							}
 						}
 					} else {
-						if(alpm_filelist_contains(files, targ)) {
+						if(alpm_filelist_contains(&filelist, targ)) {
 							match = alpm_list_add(match, targ);
 							found = 1;
 						}
 					}
 				} else {
-					for(size_t f = 0; f < files->count; f++) {
-						char *c = strrchr(files->files[f].name, '/');
+					for(size_t f = 0; f < filelist.count; f++) {
+						char *c = strrchr(filelist.files[f].name, '/');
 						if(c && *(c + 1)) {
 							if(regex) {
 								m = regexec(reg, (c + 1), 0, 0, 0);
@@ -188,7 +204,7 @@ static int files_search(alpm_list_t *syncs, alpm_list_t *targets, int regex) {
 								m = strcmp(c + 1, targ);
 							}
 							if(m == 0) {
-								match = alpm_list_add(match, files->files[f].name);
+								match = alpm_list_add(match, filelist.files[f].name);
 								found = 1;
 							}
 						}
@@ -199,28 +215,33 @@ static int files_search(alpm_list_t *syncs, alpm_list_t *targets, int regex) {
 					print_match(match, repo, pkg, exact_file);
 					alpm_list_free(match);
 				}
+
+				if(!found) {
+					ret = 1;
+				}
 			}
 		}
 
-		if(!found) {
-			ret = 1;
-		}
+		alpm_db_files_close(files);
 	}
 
 cleanup:
 	alpm_list_free_inner(filetargs, (alpm_list_fn_free) filetarget_free);
 	alpm_list_free(filetargs);
+	alpm_filelist_free(&filelist);
+
+	if(pkgname) {
+		free(pkgname);
+	}
 
 	return ret;
 }
 
-static void dump_file_list(alpm_pkg_t *pkg) {
+static void dump_file_list(alpm_pkg_t *pkg, alpm_filelist_t *pkgfiles) {
 	const char *pkgname;
-	alpm_filelist_t *pkgfiles;
 	size_t i;
 
 	pkgname = alpm_pkg_get_name(pkg);
-	pkgfiles = alpm_pkg_get_files(pkg);
 
 	for(i = 0; i < pkgfiles->count; i++) {
 		const alpm_file_t *file = pkgfiles->files + i;
@@ -239,73 +260,97 @@ static void dump_file_list(alpm_pkg_t *pkg) {
 static int files_list(alpm_list_t *syncs, alpm_list_t *targets) {
 	alpm_list_t *i, *j;
 	int ret = 0;
+	size_t found = 0;
+	alpm_filelist_t filelist = {0};
+	char *pkgname = NULL;
 
-	if(targets != NULL) {
-		for(i = targets; i; i = alpm_list_next(i)) {
-			int found = 0;
-			char *targ = i->data;
-			char *repo = NULL;
-			char *c = strchr(targ, '/');
-
-			if(c) {
-				if(! *(c + 1)) {
-					pm_printf(ALPM_LOG_ERROR,
-						_("invalid package: '%s'\n"), targ);
-					ret += 1;
-					continue;
-				}
+	for(j = syncs; j; j = alpm_list_next(j)) {
+		alpm_db_t *db = j->data;
+		alpm_db_files_t *files = alpm_db_files_open(db);
+
+		if(!files) {
+			continue;
+		}
 
-				repo = strndup(targ, c - targ);
-				targ = c + 1;
+		while(1) {
+			int ok = alpm_db_files_next(files, &pkgname);
+			if(ok == 1) {
+				break;
+			}
+			if(ok != 0) {
+				continue;
 			}
 
-			for(j = syncs; j; j = alpm_list_next(j)) {
-				alpm_pkg_t *pkg;
-				alpm_db_t *db = j->data;
+			if(targets != NULL) {
+				int match = 0;
+				for(i = targets; i; i = alpm_list_next(i)) {
+					char *targ =  i->data;
+					char *c = strchr(targ, '/');
+					char *repo = NULL;
+
+					if(c) {
+						if(! *(c + 1)) {
+							pm_printf(ALPM_LOG_ERROR,
+								_("invalid package: '%s'\n"), targ);
+							ret = 1;
+							continue;
+						}
 
-				if(repo) {
-					if(strcmp(alpm_db_get_name(db), repo) != 0) {
-						continue;
+						repo = strndup(targ, c - targ);
+						targ = c + 1;
 					}
-				}
 
-				if((pkg = alpm_db_get_pkg(db, targ)) != NULL) {
-					found = 1;
-					if(config->op_f_machinereadable) {
-						dump_pkg_machinereadable(db, pkg);
-					} else {
-						dump_file_list(pkg);
+					if(repo) {
+						if(strcmp(alpm_db_get_name(db), repo) != 0) {
+							free(repo);
+							continue;
+						}
+						free(repo);
 					}
-					break;
+
+					if(strcmp(pkgname, targ) == 0) {
+						match = 1;
+						found++;
+						break;
+					}
+				}
+
+				if(!match) {
+					continue;
 				}
 			}
-			if(!found) {
-				targ = i->data;
-				pm_printf(ALPM_LOG_ERROR,
-						_("package '%s' was not found\n"), targ);
-				ret += 1;
+
+
+			if(alpm_db_files_load(files, &filelist) != 0) {
+				ret = 1;
+				continue;
 			}
-			free(repo);
-		}
-	} else {
-		for(i = syncs; i; i = alpm_list_next(i)) {
-		alpm_db_t *db = i->data;
 
-			for(j = alpm_db_get_pkgcache(db); j; j = alpm_list_next(j)) {
-				alpm_pkg_t *pkg = j->data;
-				if(config->op_f_machinereadable) {
-					dump_pkg_machinereadable(db, pkg);
-				} else {
-					dump_file_list(pkg);
-				}
+			alpm_pkg_t *pkg = alpm_db_get_pkg(db, pkgname);
+
+			if(config->op_f_machinereadable) {
+				dump_pkg_machinereadable(db, pkg, &filelist);
+			} else {
+				dump_file_list(pkg, &filelist);
 			}
+			break;
 		}
+		alpm_db_files_close(files);
+	}
+
+	alpm_filelist_free(&filelist);
+
+	if(found != alpm_list_count(targets)) {
+		ret = 1;
+	}
+
+	if(pkgname) {
+		free(pkgname);
 	}
 
 	return ret;
 }
 
-
 int pacman_files(alpm_list_t *targets)
 {
 	alpm_list_t *files_dbs = NULL;
-- 
2.31.1


More information about the pacman-dev mailing list