[pacman-dev] [PATCH 1/4] dload: handle irregular URLs
Dan McGee
dpmcgee at gmail.com
Sun Jul 3 14:56:05 EDT 2011
On Fri, Jul 1, 2011 at 7:59 AM, Dave Reisner <d at falconindy.com> wrote:
> From: Dave Reisner <d at falconindy.com>
>
> URLs might end with a slash and follow redirects, or could be a
> generated by a script such as /getpkg.php?id=12345. In both cases, we
> may have a better filename that we can write to, taken from either
> content-disposition header, or the effective URL.
>
> Specific to the first case, we write to a temporary file of the format
> 'alpmtmp.XXXXXX', where XXXXXX is randomized by mkstemp(3). Since this
> is a randomly generated file, we cannot support resuming and the file is
> unlinked in the event of an interrupt.
>
> We also run into the possibility of changing out the filename from under
> alpm on a -U operation, so callers of _alpm_download can optionally pass
> a pointer to a *char to be filled in by curl_download_internal with the
> actual filename we wrote to. Any sync operation will pass a NULL pointer
> here, as we rely on specific names for packages from a mirror.
>
> Fixes FS#22645.
>
> Signed-off-by: Dave Reisner <d at falconindy.com>
> ---
> lib/libalpm/be_sync.c | 4 +-
> lib/libalpm/dload.c | 129 +++++++++++++++++++++++++++++++++++++++++--------
> lib/libalpm/dload.h | 3 +-
> lib/libalpm/sync.c | 2 +-
> 4 files changed, 114 insertions(+), 24 deletions(-)
>
> diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
> index ce418d5..16351f8 100644
> --- a/lib/libalpm/be_sync.c
> +++ b/lib/libalpm/be_sync.c
> @@ -184,7 +184,7 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
> CALLOC(fileurl, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, -1));
> snprintf(fileurl, len, "%s/%s.db", server, db->treename);
>
> - ret = _alpm_download(handle, fileurl, syncpath, force, 0, 0);
> + ret = _alpm_download(handle, fileurl, syncpath, NULL, force, 0, 0);
>
> if(ret == 0 && (check_sig == PM_PGP_VERIFY_ALWAYS ||
> check_sig == PM_PGP_VERIFY_OPTIONAL)) {
> @@ -201,7 +201,7 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
> /* if we downloaded a DB, we want the .sig from the same server */
> snprintf(fileurl, len, "%s/%s.db.sig", server, db->treename);
>
> - sig_ret = _alpm_download(handle, fileurl, syncpath, 1, 0, errors_ok);
> + sig_ret = _alpm_download(handle, fileurl, syncpath, NULL, 1, 0, errors_ok);
> /* errors_ok suppresses error messages, but not the return code */
> sig_ret = errors_ok ? 0 : sig_ret;
> }
> diff --git a/lib/libalpm/dload.c b/lib/libalpm/dload.c
> index fd83aac..eefa285 100644
> --- a/lib/libalpm/dload.c
> +++ b/lib/libalpm/dload.c
> @@ -148,16 +148,46 @@ static int utimes_long(const char *path, long seconds)
> return 0;
> }
>
> +static size_t parse_headers(void *ptr, size_t size, size_t nmemb, void *user)
> +{
> + size_t realsize = size * nmemb;
> + const char *fptr, *endptr = NULL;
> + const char * const cd_header = "Content-Disposition:";
> + const char * const fn_key = "filename=";
> + struct fileinfo **dlfile = (struct fileinfo**)user;
> +
> + if(strncasecmp(cd_header, ptr, strlen(cd_header)) == 0) {
Locale issues come into play? This string has an 'i'. If cURL
normalizes headers one way or the other we should probably do a direct
compare, but the strncasemp() isn't explicit regarding locale or
anything, so it may just map lower/upper ASCII to the opposite ASCII
and that is it.
> + if((fptr = strstr(ptr, fn_key))) {
> + fptr += strlen(fn_key);
> +
> + /* find the end of the field, which is either a semi-colon, or the end of
> + * the data. As per curl_easy_setopt(3), we cannot count on headers being
> + * null terminated, so we look for the closing \r\n */
> + endptr = fptr + strcspn(fptr, ";\r\n") - 1;
> +
> + /* remove quotes */
> + if(*fptr == '"' && *endptr == '"') {
> + fptr++;
> + endptr--;
> + }
> +
> + STRNDUP((*dlfile)->cd_filename, fptr, endptr - fptr + 1,
> + RET_ERR((*dlfile)->handle, PM_ERR_MEMORY, realsize));
> + }
> + }
>
> -static int curl_download_internal(alpm_handle_t *handle,
> - const char *url, const char *localpath,
> - int force, int allow_resume, int errors_ok)
> + return realsize;
> +}
> +
> +static int curl_download_internal(alpm_handle_t *handle, const char *url,
> + const char *localpath, char **final_file, int force, int allow_resume,
> + int errors_ok)
I believe this hideous function signature gets addressed later, so I
won't worry about it now.
> {
> - int ret = -1;
> + int ret = -1, should_unlink = 0;
> FILE *localf = NULL;
> const char *useragent;
> const char *open_mode = "wb";
> - char *destfile, *tempfile;
> + char *destfile = NULL, *tempfile = NULL, *effective_url;
> /* RFC1123 states applications should support this length */
> char hostname[256];
> char error_buffer[CURL_ERROR_SIZE];
> @@ -170,15 +200,39 @@ static int curl_download_internal(alpm_handle_t *handle,
> dlfile.handle = handle;
> dlfile.initial_size = 0.0;
> dlfile.filename = get_filename(url);
> + dlfile.cd_filename = NULL;
> if(!dlfile.filename || curl_gethost(url, hostname) != 0) {
> _alpm_log(handle, PM_LOG_ERROR, _("url '%s' is invalid\n"), url);
> RET_ERR(handle, PM_ERR_SERVER_BAD_URL, -1);
> }
>
> - destfile = get_fullpath(localpath, dlfile.filename, "");
> - tempfile = get_fullpath(localpath, dlfile.filename, ".part");
> - if(!destfile || !tempfile) {
> - goto cleanup;
> + if(strlen(dlfile.filename) > 0 && strcmp(dlfile.filename, ".sig") != 0) {
> + destfile = get_fullpath(localpath, dlfile.filename, "");
> + tempfile = get_fullpath(localpath, dlfile.filename, ".part");
> + if(!destfile || !tempfile) {
> + goto cleanup;
> + }
> + } else {
> + /* URL isn't to a file and ended with a slash */
> + int fd;
> + char randpath[PATH_MAX];
> +
> + /* we can't support resuming this kind of download, so a partial transfer
> + * will be destroyed */
> + should_unlink = 1;
> +
> + /* create a random filename, which is opened with O_EXCL */
> + snprintf(randpath, PATH_MAX, "%salpmtmp.XXXXXX", localpath);
> + if((fd = mkstemp(randpath)) == -1 || !(localf = fdopen(fd, open_mode))) {
> + unlink(randpath);
> + close(fd);
> + _alpm_log(handle, PM_LOG_ERROR,
> + _("failed to create temporary file for download\n"));
> + goto cleanup;
> + }
> + /* localf now points to our alpmtmp.XXXXXX */
> + STRDUP(tempfile, randpath, RET_ERR(handle, PM_ERR_MEMORY, -1));
> + dlfile.filename = strrchr(randpath, '/') + 1;
> }
>
> error_buffer[0] = '\0';
> @@ -197,6 +251,8 @@ static int curl_download_internal(alpm_handle_t *handle,
> curl_easy_setopt(handle->curl, CURLOPT_PROGRESSDATA, (void *)&dlfile);
> curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_LIMIT, 1024L);
> curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_TIME, 10L);
> + curl_easy_setopt(handle->curl, CURLOPT_HEADERFUNCTION, parse_headers);
> + curl_easy_setopt(handle->curl, CURLOPT_WRITEHEADER, &dlfile);
>
> useragent = getenv("HTTP_USER_AGENT");
> if(useragent != NULL) {
> @@ -215,9 +271,11 @@ static int curl_download_internal(alpm_handle_t *handle,
> dlfile.initial_size = (double)st.st_size;
> }
>
> - localf = fopen(tempfile, open_mode);
> if(localf == NULL) {
> - goto cleanup;
> + localf = fopen(tempfile, open_mode);
> + if(localf == NULL) {
> + goto cleanup;
> + }
> }
>
> curl_easy_setopt(handle->curl, CURLOPT_WRITEDATA, localf);
> @@ -264,6 +322,7 @@ static int curl_download_internal(alpm_handle_t *handle,
> curl_easy_getinfo(handle->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &remote_size);
> curl_easy_getinfo(handle->curl, CURLINFO_SIZE_DOWNLOAD, &bytes_dl);
> curl_easy_getinfo(handle->curl, CURLINFO_CONDITION_UNMET, &timecond);
> + curl_easy_getinfo(handle->curl, CURLINFO_EFFECTIVE_URL, &effective_url);
>
> /* time condition was met and we didn't download anything. we need to
> * clean up the 0 byte .part file that's left behind. */
> @@ -284,6 +343,26 @@ static int curl_download_internal(alpm_handle_t *handle,
> goto cleanup;
> }
>
> + if(dlfile.cd_filename) {
> + /* content-disposition header has a better name for our file */
> + free(destfile);
> + destfile = get_fullpath(localpath, dlfile.cd_filename, "");
> + } else {
> + const char *effective_filename = strrchr(effective_url, '/');
> + if(effective_filename) {
> + effective_filename++;
> +
> + /* if destfile was never set, we wrote to a tempfile. even if destfile is
> + * set, we may have followed some redirects and the effective url may
> + * have a better suggestion as to what to name our file. in either case,
> + * refactor destfile to this newly derived name. */
> + if(!destfile || strcmp(effective_filename, strrchr(destfile, '/') + 1) != 0) {
Any chance of strrchr returning NULL here?
> + free(destfile);
> + destfile = get_fullpath(localpath, effective_filename, "");
> + }
> + }
> + }
> +
> ret = 0;
>
> cleanup:
> @@ -294,10 +373,18 @@ cleanup:
>
> if(ret == 0) {
> rename(tempfile, destfile);
> + if(final_file) {
> + *final_file = strdup(strrchr(destfile, '/') + 1);
Same concern as above with strrchr(). STRDUP would be nice too.
> + }
> + }
> +
> + if(dload_interrupted && should_unlink) {
> + unlink(tempfile);
> }
>
> FREE(tempfile);
> FREE(destfile);
> + FREE(dlfile.cd_filename);
>
> /* restore the old signal handlers */
> sigaction(SIGINT, &sig_int[OLD], NULL);
> @@ -316,18 +403,19 @@ cleanup:
> * @param handle the context handle
> * @param url the file's URL
> * @param localpath the directory to save the file in
> + * @param final_file the real name of the downloaded file (may be NULL)
> * @param force force download even if there is an up-to-date local copy
> * @param allow_resume allow a partial download to be resumed
> * @param errors_ok do not log errors (but still return them)
> * @return 0 on success, -1 on error (pm_errno is set accordingly if errors_ok == 0)
> */
> int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath,
> - int force, int allow_resume, int errors_ok)
> + char **final_file, int force, int allow_resume, int errors_ok)
> {
> if(handle->fetchcb == NULL) {
> #ifdef HAVE_LIBCURL
> - return curl_download_internal(handle, url, localpath,
> - force, allow_resume, errors_ok);
> + return curl_download_internal(handle, url, localpath, final_file, force,
> + allow_resume, errors_ok);
> #else
> RET_ERR(handle, PM_ERR_EXTERNAL_DOWNLOAD, -1);
> #endif
> @@ -344,18 +432,17 @@ int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath
> char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
> {
> char *filepath;
> - const char *filename, *cachedir;
> + const char *cachedir;
> + char *final_file = NULL;
> int ret;
>
> CHECK_HANDLE(handle, return NULL);
>
> - filename = get_filename(url);
> -
> /* find a valid cache dir to download to */
> cachedir = _alpm_filecache_setup(handle);
>
> /* download the file */
> - ret = _alpm_download(handle, url, cachedir, 0, 1, 0);
> + ret = _alpm_download(handle, url, cachedir, &final_file, 0, 1, 0);
> if(ret == -1) {
> _alpm_log(handle, PM_LOG_WARNING, _("failed to download %s\n"), url);
> return NULL;
> @@ -373,7 +460,7 @@ char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
> CALLOC(sig_url, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, NULL));
> snprintf(sig_url, len, "%s.sig", url);
>
> - ret = _alpm_download(handle, sig_url, cachedir, 1, 0, errors_ok);
> + ret = _alpm_download(handle, sig_url, cachedir, &final_file, 1, 0, errors_ok);
> if(ret == -1 && !errors_ok) {
> _alpm_log(handle, PM_LOG_WARNING, _("failed to download %s\n"), sig_url);
> /* Warn now, but don't return NULL. We will fail later during package
> @@ -385,7 +472,9 @@ char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
> }
>
> /* we should be able to find the file the second time around */
> - filepath = _alpm_filecache_find(handle, filename);
> + filepath = _alpm_filecache_find(handle, final_file);
> + FREE(final_file);
> +
> return filepath;
> }
>
> diff --git a/lib/libalpm/dload.h b/lib/libalpm/dload.h
> index 0cdd900..351b2b3 100644
> --- a/lib/libalpm/dload.h
> +++ b/lib/libalpm/dload.h
> @@ -29,11 +29,12 @@
> struct fileinfo {
> alpm_handle_t *handle;
> const char *filename;
> + char *cd_filename;
> double initial_size;
> };
>
> int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath,
> - int force, int allow_resume, int errors_ok);
> + char **final_file, int force, int allow_resume, int errors_ok);
>
> #endif /* _ALPM_DLOAD_H */
>
> diff --git a/lib/libalpm/sync.c b/lib/libalpm/sync.c
> index 8f03840..8df8a17 100644
> --- a/lib/libalpm/sync.c
> +++ b/lib/libalpm/sync.c
> @@ -797,7 +797,7 @@ static int download_files(alpm_handle_t *handle, alpm_list_t **deltas)
> CALLOC(fileurl, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, -1));
> snprintf(fileurl, len, "%s/%s", server_url, filename);
>
> - ret = _alpm_download(handle, fileurl, cachedir, 0, 1, 0);
> + ret = _alpm_download(handle, fileurl, cachedir, NULL, 0, 1, 0);
> FREE(fileurl);
> if(ret != -1) {
> break;
> --
> 1.7.6
More information about the pacman-dev
mailing list