[pacman-dev] [PATCH] Add an archive_fgets() function
This crude function allows reading from an archive on a line-by-line basis similar to the familiar fgets() call on a FILE stream. This is the first step in being able to read DB entries straight from an archive. Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/util.c | 29 +++++++++++++++++++++++++++++ lib/libalpm/util.h | 2 ++ 2 files changed, 31 insertions(+), 0 deletions(-) diff --git a/lib/libalpm/util.c b/lib/libalpm/util.c index 806f601..92a0a26 100644 --- a/lib/libalpm/util.c +++ b/lib/libalpm/util.c @@ -673,4 +673,33 @@ int _alpm_test_md5sum(const char *filepath, const char *md5sum) return(ret); } +char *_alpm_archive_fgets(char *line, size_t size, struct archive *a) +{ + /* for now, just read one char at a time until we get to a + * '\n' char. we can optimize this later with an internal + * buffer. */ + /* leave room for zero terminator */ + int want = size - 1; + int loop; + + for(loop = 0; loop < want; loop++) { + int ret = archive_read_data(a, line + loop, 1); + /* special check for first read- if null, return null, + * this indicates EOF */ + if(loop == 0 && (ret <= 0 || line[loop] == '\0')) { + return(NULL); + } + /* check if read value was null or newline */ + if(ret <= 0 || line[loop] == '\0' || line[loop] == '\n') { + want = loop + 1; + break; + } + } + + /* always null terminate the buffer */ + line[want] = '\0'; + + return(line); +} + /* vim: set ts=2 sw=2 noet: */ diff --git a/lib/libalpm/util.h b/lib/libalpm/util.h index 6849fa1..0bf122c 100644 --- a/lib/libalpm/util.h +++ b/lib/libalpm/util.h @@ -30,6 +30,7 @@ #include <stdarg.h> #include <time.h> #include <sys/stat.h> /* struct stat */ +#include <archive.h> /* struct archive */ #ifdef ENABLE_NLS #include <libintl.h> /* here so it doesn't need to be included elsewhere */ @@ -66,6 +67,7 @@ char *_alpm_filecache_find(const char *filename); const char *_alpm_filecache_setup(void); int _alpm_lstat(const char *path, struct stat *buf); int _alpm_test_md5sum(const char *filepath, const char *md5sum); +char *_alpm_archive_fgets(char *line, size_t size, struct archive *a); #ifndef HAVE_STRVERSCMP int strverscmp(const char *, const char *); -- 1.5.4.4
With the addition of the archive_fgets() function, we can now skip the temp file usage in pkg_load/parse_descfile that was not needed. This has a nice benefit of probably being both faster, reducing code, and getting rid of "expensive" file operations. Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/package.c | 61 +++++++++---------------------------------------- 1 files changed, 11 insertions(+), 50 deletions(-) diff --git a/lib/libalpm/package.c b/lib/libalpm/package.c index dbe3c98..587e7fc 100644 --- a/lib/libalpm/package.c +++ b/lib/libalpm/package.c @@ -889,15 +889,12 @@ int _alpm_pkgname_pkg_cmp(const void *pkgname, const void *package) } -/* Parses the package description file for the current package - * TODO: this should ALL be in a backend interface (be_files), we should - * be dealing with the abstracted concepts only in this file +/* Parses the package description file for the current package. This + * is handed the struct archive when the .PKGINFO file is open. * Returns: 0 on success, 1 on error - * */ -static int parse_descfile(const char *descfile, pmpkg_t *info) +static int parse_descfile(struct archive *a, pmpkg_t *info) { - FILE* fp = NULL; char line[PATH_MAX]; char *ptr = NULL; char *key = NULL; @@ -905,13 +902,8 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) ALPM_LOG_FUNC; - if((fp = fopen(descfile, "r")) == NULL) { - _alpm_log(PM_LOG_ERROR, _("could not open file %s: %s\n"), descfile, strerror(errno)); - return(-1); - } - - while(!feof(fp)) { - fgets(line, PATH_MAX, fp); + /* loop until we reach EOF (where archive_fgets will return NULL) */ + while(_alpm_archive_fgets(line, PATH_MAX, a) != NULL) { linenum++; _alpm_strtrim(line); if(strlen(line) == 0 || line[0] == '#') { @@ -921,7 +913,7 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) key = strsep(&ptr, "="); if(key == NULL || ptr == NULL) { _alpm_log(PM_LOG_DEBUG, "%s: syntax error in description file line %d\n", - info->name[0] != '\0' ? info->name : "error", linenum); + info->name ? info->name : "error", linenum); } else { key = _alpm_strtrim(key); ptr = _alpm_strtrim(ptr); @@ -940,7 +932,7 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) } else if(!strcmp(key, "builddate")) { char first = tolower(ptr[0]); if(first > 'a' && first < 'z') { - struct tm tmp_tm = {0}; //initialize to null incase of failure + struct tm tmp_tm = {0}; //initialize to null in case of failure setlocale(LC_TIME, "C"); strptime(ptr, "%a %b %e %H:%M:%S %Y", &tmp_tm); info->builddate = mktime(&tmp_tm); @@ -970,13 +962,11 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) info->backup = alpm_list_add(info->backup, strdup(ptr)); } else { _alpm_log(PM_LOG_DEBUG, "%s: syntax error in description file line %d\n", - info->name[0] != '\0' ? info->name : "error", linenum); + info->name ? info->name : "error", linenum); } } line[0] = '\0'; } - fclose(fp); - unlink(descfile); return(0); } @@ -996,8 +986,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) struct archive *archive; struct archive_entry *entry; pmpkg_t *info = NULL; - char *descfile = NULL; - int fd = -1; struct stat st; ALPM_LOG_FUNC; @@ -1028,32 +1016,15 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) info->size = st.st_size; } - /* TODO there is no reason to make temp files to read - * from a libarchive archive, it can be done by reading - * directly from the archive - * See: archive_read_data_into_buffer - * requires changes 'parse_descfile' as well - * */ - /* If full is false, only read through the archive until we find our needed * metadata. If it is true, read through the entire archive, which serves * as a verfication of integrity and allows us to create the filelist. */ while((ret = archive_read_next_header(archive, &entry)) == ARCHIVE_OK) { const char *entry_name = archive_entry_pathname(entry); - /* NOTE: we used to look for .FILELIST, but it is easier (and safer) for - * us to just generate this on our own. */ if(strcmp(entry_name, ".PKGINFO") == 0) { - /* extract this file into /tmp. it has info for us */ - descfile = strdup("/tmp/alpm_XXXXXX"); - fd = mkstemp(descfile); - if(archive_read_data_into_fd(archive, fd) != ARCHIVE_OK) { - _alpm_log(PM_LOG_ERROR, _("error extracting package description file to %s\n"), - descfile); - goto pkg_invalid; - } /* parse the info file */ - if(parse_descfile(descfile, info) == -1) { + if(parse_descfile(archive, info) != 0) { _alpm_log(PM_LOG_ERROR, _("could not parse package description file in %s\n"), pkgfile); goto pkg_invalid; @@ -1067,9 +1038,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) goto pkg_invalid; } config = 1; - unlink(descfile); - FREE(descfile); - close(fd); continue; } else if(strcmp(entry_name, ".INSTALL") == 0) { info->scriptlet = 1; @@ -1113,13 +1081,13 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) info->origin_data.file = strdup(pkgfile); if(full) { - /* "checking for conflicts" requires a sorted list, so we ensure that here */ + /* "checking for conflicts" requires a sorted list, ensure that here */ _alpm_log(PM_LOG_DEBUG, "sorting package filelist for %s\n", pkgfile); info->files = alpm_list_msort(info->files, alpm_list_count(info->files), _alpm_str_cmp); info->infolevel = INFRQ_ALL; } else { - /* get rid of any partial filelist we may have collected, as it is invalid */ + /* get rid of any partial filelist we may have collected, it is invalid */ FREELIST(info->files); info->infolevel = INFRQ_BASE | INFRQ_DESC | INFRQ_DEPENDS; } @@ -1128,13 +1096,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) pkg_invalid: pm_errno = PM_ERR_PKG_INVALID; - if(descfile) { - unlink(descfile); - FREE(descfile); - } - if(fd != -1) { - close(fd); - } error: _alpm_pkg_free(info); archive_read_finish(archive); -- 1.5.4.4
Dan McGee wrote:
With the addition of the archive_fgets() function, we can now skip the temp file usage in pkg_load/parse_descfile that was not needed. This has a nice benefit of probably being both faster, reducing code, and getting rid of "expensive" file operations.
Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/package.c | 61 +++++++++---------------------------------------- 1 files changed, 11 insertions(+), 50 deletions(-)
That looks great, nice job.
On Thu, Mar 27, 2008 at 4:37 AM, Xavier <shiningxc@gmail.com> wrote:
Dan McGee wrote:
With the addition of the archive_fgets() function, we can now skip the temp file usage in pkg_load/parse_descfile that was not needed. This has a nice benefit of probably being both faster, reducing code, and getting rid of "expensive" file operations.
Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/package.c | 61 +++++++++---------------------------------------- 1 files changed, 11 insertions(+), 50 deletions(-)
That looks great, nice job.
Pfft, don't pat him on the back TOO much. A lot of those deletions were comments... cheater! 8)
Aaron Griffin wrote:
On Thu, Mar 27, 2008 at 4:37 AM, Xavier <shiningxc@gmail.com> wrote:
Dan McGee wrote:
With the addition of the archive_fgets() function, we can now skip the temp file usage in pkg_load/parse_descfile that was not needed. This has a nice benefit of probably being both faster, reducing code, and getting rid of "expensive" file operations.
Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/package.c | 61 +++++++++---------------------------------------- 1 files changed, 11 insertions(+), 50 deletions(-)
That looks great, nice job.
Pfft, don't pat him on the back TOO much. A lot of those deletions were comments... cheater! 8)
Ahah, well not that much, there were quite a few lines for all these pointless file operations. I find the code nicer without them, and it's also one TODO less in the code. :)
Interesting. I was in the process of reading the whole file into a buffer and having parse_descfile parse that. Oh well... k On Wed, 2008-03-26 at 21:41 -0500, Dan McGee wrote:
With the addition of the archive_fgets() function, we can now skip the temp file usage in pkg_load/parse_descfile that was not needed. This has a nice benefit of probably being both faster, reducing code, and getting rid of "expensive" file operations.
Signed-off-by: Dan McGee <dan@archlinux.org> --- lib/libalpm/package.c | 61 +++++++++---------------------------------------- 1 files changed, 11 insertions(+), 50 deletions(-)
diff --git a/lib/libalpm/package.c b/lib/libalpm/package.c index dbe3c98..587e7fc 100644 --- a/lib/libalpm/package.c +++ b/lib/libalpm/package.c @@ -889,15 +889,12 @@ int _alpm_pkgname_pkg_cmp(const void *pkgname, const void *package) }
-/* Parses the package description file for the current package - * TODO: this should ALL be in a backend interface (be_files), we should - * be dealing with the abstracted concepts only in this file +/* Parses the package description file for the current package. This + * is handed the struct archive when the .PKGINFO file is open. * Returns: 0 on success, 1 on error - * */ -static int parse_descfile(const char *descfile, pmpkg_t *info) +static int parse_descfile(struct archive *a, pmpkg_t *info) { - FILE* fp = NULL; char line[PATH_MAX]; char *ptr = NULL; char *key = NULL; @@ -905,13 +902,8 @@ static int parse_descfile(const char *descfile, pmpkg_t *info)
ALPM_LOG_FUNC;
- if((fp = fopen(descfile, "r")) == NULL) { - _alpm_log(PM_LOG_ERROR, _("could not open file %s: %s\n"), descfile, strerror(errno)); - return(-1); - } - - while(!feof(fp)) { - fgets(line, PATH_MAX, fp); + /* loop until we reach EOF (where archive_fgets will return NULL) */ + while(_alpm_archive_fgets(line, PATH_MAX, a) != NULL) { linenum++; _alpm_strtrim(line); if(strlen(line) == 0 || line[0] == '#') { @@ -921,7 +913,7 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) key = strsep(&ptr, "="); if(key == NULL || ptr == NULL) { _alpm_log(PM_LOG_DEBUG, "%s: syntax error in description file line %d\n", - info->name[0] != '\0' ? info->name : "error", linenum); + info->name ? info->name : "error", linenum); } else { key = _alpm_strtrim(key); ptr = _alpm_strtrim(ptr); @@ -940,7 +932,7 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) } else if(!strcmp(key, "builddate")) { char first = tolower(ptr[0]); if(first > 'a' && first < 'z') { - struct tm tmp_tm = {0}; //initialize to null incase of failure + struct tm tmp_tm = {0}; //initialize to null in case of failure setlocale(LC_TIME, "C"); strptime(ptr, "%a %b %e %H:%M:%S %Y", &tmp_tm); info->builddate = mktime(&tmp_tm); @@ -970,13 +962,11 @@ static int parse_descfile(const char *descfile, pmpkg_t *info) info->backup = alpm_list_add(info->backup, strdup(ptr)); } else { _alpm_log(PM_LOG_DEBUG, "%s: syntax error in description file line %d\n", - info->name[0] != '\0' ? info->name : "error", linenum); + info->name ? info->name : "error", linenum); } } line[0] = '\0'; } - fclose(fp); - unlink(descfile);
return(0); } @@ -996,8 +986,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) struct archive *archive; struct archive_entry *entry; pmpkg_t *info = NULL; - char *descfile = NULL; - int fd = -1; struct stat st;
ALPM_LOG_FUNC; @@ -1028,32 +1016,15 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) info->size = st.st_size; }
- /* TODO there is no reason to make temp files to read - * from a libarchive archive, it can be done by reading - * directly from the archive - * See: archive_read_data_into_buffer - * requires changes 'parse_descfile' as well - * */ - /* If full is false, only read through the archive until we find our needed * metadata. If it is true, read through the entire archive, which serves * as a verfication of integrity and allows us to create the filelist. */ while((ret = archive_read_next_header(archive, &entry)) == ARCHIVE_OK) { const char *entry_name = archive_entry_pathname(entry);
- /* NOTE: we used to look for .FILELIST, but it is easier (and safer) for - * us to just generate this on our own. */ if(strcmp(entry_name, ".PKGINFO") == 0) { - /* extract this file into /tmp. it has info for us */ - descfile = strdup("/tmp/alpm_XXXXXX"); - fd = mkstemp(descfile); - if(archive_read_data_into_fd(archive, fd) != ARCHIVE_OK) { - _alpm_log(PM_LOG_ERROR, _("error extracting package description file to %s\n"), - descfile); - goto pkg_invalid; - } /* parse the info file */ - if(parse_descfile(descfile, info) == -1) { + if(parse_descfile(archive, info) != 0) { _alpm_log(PM_LOG_ERROR, _("could not parse package description file in %s\n"), pkgfile); goto pkg_invalid; @@ -1067,9 +1038,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) goto pkg_invalid; } config = 1; - unlink(descfile); - FREE(descfile); - close(fd); continue; } else if(strcmp(entry_name, ".INSTALL") == 0) { info->scriptlet = 1; @@ -1113,13 +1081,13 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full) info->origin_data.file = strdup(pkgfile);
if(full) { - /* "checking for conflicts" requires a sorted list, so we ensure that here */ + /* "checking for conflicts" requires a sorted list, ensure that here */ _alpm_log(PM_LOG_DEBUG, "sorting package filelist for %s\n", pkgfile); info->files = alpm_list_msort(info->files, alpm_list_count(info->files), _alpm_str_cmp); info->infolevel = INFRQ_ALL; } else { - /* get rid of any partial filelist we may have collected, as it is invalid */ + /* get rid of any partial filelist we may have collected, it is invalid */ FREELIST(info->files); info->infolevel = INFRQ_BASE | INFRQ_DESC | INFRQ_DEPENDS; } @@ -1128,13 +1096,6 @@ pmpkg_t *_alpm_pkg_load(const char *pkgfile, unsigned short full)
pkg_invalid: pm_errno = PM_ERR_PKG_INVALID; - if(descfile) { - unlink(descfile); - FREE(descfile); - } - if(fd != -1) { - close(fd); - } error: _alpm_pkg_free(info); archive_read_finish(archive); -- K. Piche <kpiche@rogers.com>
On Sat, Mar 29, 2008 at 2:36 PM, K. Piche <kpiche@rogers.com> wrote:
Interesting. I was in the process of reading the whole file into a buffer and having parse_descfile parse that. Oh well...
That is quite similar to Xavier's patch that keeps an internal buffer in archive_fgets() in order to reduce calls into libarchive, which seems to cut the time by half to parse the entire community DB (.12 vs .07 seconds locally on my desktop machine). No matter how you cut it, not writing a temp file definitely saves time. -Dan
On Sat, 2008-03-29 at 14:39 -0500, Dan McGee wrote:
On Sat, Mar 29, 2008 at 2:36 PM, K. Piche <kpiche@rogers.com> wrote:
Interesting. I was in the process of reading the whole file into a buffer and having parse_descfile parse that. Oh well...
That is quite similar to Xavier's patch that keeps an internal buffer in archive_fgets() in order to reduce calls into libarchive, which seems to cut the time by half to parse the entire community DB (.12 vs .07 seconds locally on my desktop machine).
True. We know how large the file is during the archive read from the entity struct so I figured it would be easier to malloc a buffer and pop the whole file in it. Parsing the buffer it would have required strdup'ing lines but no code would be required for overrunning a line buffer - all sizes are known. As an aside: is there any interest in some unit testing code? I've looked at the check library and written a few tests for the alpm_list stuff. Unfortunately a lot of our functions are difficult to test individually. For instance instead of passing a buffer with test data to test parse_descfile you need to pass an archive.
No matter how you cut it, not writing a temp file definitely saves time.
-Dan
_______________________________________________ pacman-dev mailing list pacman-dev@archlinux.org http://archlinux.org/mailman/listinfo/pacman-dev -- K. Piche <kpiche@rogers.com>
Signed-off-by: Chantry Xavier <shiningxc@gmail.com> --- Well, I saw that comment, so I gave it a try, but I am not sure if I did it correctly. It doesn't seem too bad but maybe there are still cleaner ways. Also I wasn't sure which buf size to choose, and how to comment the code. Xav lib/libalpm/util.c | 30 ++++++++++++++++++++++++------ 1 files changed, 24 insertions(+), 6 deletions(-) diff --git a/lib/libalpm/util.c b/lib/libalpm/util.c index 92a0a26..0b7750e 100644 --- a/lib/libalpm/util.c +++ b/lib/libalpm/util.c @@ -675,22 +675,40 @@ int _alpm_test_md5sum(const char *filepath, const char *md5sum) char *_alpm_archive_fgets(char *line, size_t size, struct archive *a) { - /* for now, just read one char at a time until we get to a - * '\n' char. we can optimize this later with an internal - * buffer. */ + /* internal buffer */ + static char buf[BUFSIZ]; + /* index of the next char to read in the buffer */ + static int index = 0; + /* total number of char in the buffer */ + static int nread = 0; /* leave room for zero terminator */ int want = size - 1; int loop; for(loop = 0; loop < want; loop++) { - int ret = archive_read_data(a, line + loop, 1); + if(index == 0) { + /* fill the buffer */ + nread = archive_read_data(a, buf, BUFSIZ); + } + if(index < nread) { + /* get the next char from the buffer */ + line[loop] = buf[index++]; + } else { + /* nothing left to read */ + line[loop] = '\0'; + index = 0; + } + if(index == BUFSIZ) { + /* end of the buffer, so reset the index to refill it */ + index = 0; + } /* special check for first read- if null, return null, * this indicates EOF */ - if(loop == 0 && (ret <= 0 || line[loop] == '\0')) { + if(loop == 0 && line[loop] == '\0') { return(NULL); } /* check if read value was null or newline */ - if(ret <= 0 || line[loop] == '\0' || line[loop] == '\n') { + if(line[loop] == '\0' || line[loop] == '\n') { want = loop + 1; break; } -- 1.5.4.4
I haven't tried this yet, but you beat me to my own suggestion, well done. :) On Thu, Mar 27, 2008 at 6:13 AM, Chantry Xavier <shiningxc@gmail.com> wrote:
Signed-off-by: Chantry Xavier <shiningxc@gmail.com> ---
Well, I saw that comment, so I gave it a try, but I am not sure if I did it correctly. It doesn't seem too bad but maybe there are still cleaner ways. Also I wasn't sure which buf size to choose, and how to comment the code.
Xav
lib/libalpm/util.c | 30 ++++++++++++++++++++++++------ 1 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/lib/libalpm/util.c b/lib/libalpm/util.c index 92a0a26..0b7750e 100644 --- a/lib/libalpm/util.c +++ b/lib/libalpm/util.c @@ -675,22 +675,40 @@ int _alpm_test_md5sum(const char *filepath, const char *md5sum)
char *_alpm_archive_fgets(char *line, size_t size, struct archive *a) { - /* for now, just read one char at a time until we get to a - * '\n' char. we can optimize this later with an internal - * buffer. */ + /* internal buffer */ + static char buf[BUFSIZ]; + /* index of the next char to read in the buffer */ + static int index = 0; + /* total number of char in the buffer */ + static int nread = 0; Can't index just be a char* pointer instead? That should make things a bit simpler below. if(index == &buf), for example, and line[loop] = index++;
/* leave room for zero terminator */ int want = size - 1; int loop;
for(loop = 0; loop < want; loop++) { - int ret = archive_read_data(a, line + loop, 1); + if(index == 0) { + /* fill the buffer */ + nread = archive_read_data(a, buf, BUFSIZ); + } + if(index < nread) { + /* get the next char from the buffer */ + line[loop] = buf[index++]; + } else { + /* nothing left to read */ + line[loop] = '\0'; + index = 0; + } + if(index == BUFSIZ) { + /* end of the buffer, so reset the index to refill it */ + index = 0; + }
Unless I'm mistaken, this should correctly handle the buffer running out in the middle of a fgets call. The best way to test your code here might be to set the buffer to something very small such as 4, and see what happens. This will also ensure you never overrun your buffer, etc.
/* special check for first read- if null, return null, * this indicates EOF */ - if(loop == 0 && (ret <= 0 || line[loop] == '\0')) { + if(loop == 0 && line[loop] == '\0') { return(NULL); } /* check if read value was null or newline */ - if(ret <= 0 || line[loop] == '\0' || line[loop] == '\n') { + if(line[loop] == '\0' || line[loop] == '\n') { want = loop + 1; break; } -- 1.5.4.4
The big issues: 1. Once you use archive_fgets, you can't do a manual archive_read* call since data is buffered here. This should get noted in a comment. 2. I can't call this function with one archive followed by a call with another archive. This would obviously cause issues, and we should probably find a way to address this. We might need a helper struct that the caller has to pass in containing anything static so this function is thread-safe, etc. 3. This can be optimized even more by removing the 1 char at a time loop and making good use of memchr and memcpy. This is what I originally planned to do when I got around to making this a buffered implementation. Good work though, glad to see someone else is interested in making improvements here, and this is a large component of the read directly from db.tar.gz files, although you can see from my other patch it can already be put to use elsewhere in the code. -Dan
participants (6)
-
Aaron Griffin
-
Chantry Xavier
-
Dan McGee
-
Dan McGee
-
K. Piche
-
Xavier