[pacman-dev] [PATCH] dload: handle irregular URLs

Dave Reisner d at falconindy.com
Sat Jun 11 15:16:42 EDT 2011


URLs might end with a slash and follow redirects, or could be
generated by a script such as /getpkg.php?id=12345. In both cases, we
may have a better filename that we can write to, taken from either the
Content-Disposition header or the effective URL.

Specific to the first case, we write to a temporary file of the format
'alpmtmp.XXXXXX', where XXXXXX is randomized by mkstemp(3). Since this
is a randomly generated file, we cannot support resuming and the file is
unlinked in the event of an interrupt.

We also run into the possibility of changing out the filename from under
alpm on a -U operation, so callers of _alpm_download can optionally pass
a pointer to a char * (i.e. a char **) to be filled in by
curl_download_internal with a newly allocated copy of the actual filename
we wrote to. Any sync operation will pass a NULL pointer here, as we rely
on specific names for packages from a mirror.

Fixes FS#22645.

Signed-off-by: Dave Reisner <d at falconindy.com>
---
There's one hack in here that I'm not happy with, so feedback is welcome.
Basically, the signature's filename is guaranteed to have a non-zero length,
because we simply append '.sig' to the URL. In the case of a URL ending with
a /, this results in downloading a file called '.sig'. This patch hardcodes
that edge case and forces the use of a temporary file. It's not ideal, but it
works.

Any other comments are also, of course, welcome.

d


 lib/libalpm/be_sync.c |    4 +-
 lib/libalpm/dload.c   |  120 ++++++++++++++++++++++++++++++++++++++++++-------
 lib/libalpm/dload.h   |    2 +-
 lib/libalpm/sync.c    |    2 +-
 4 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
index a75cbda..c12fff1 100644
--- a/lib/libalpm/be_sync.c
+++ b/lib/libalpm/be_sync.c
@@ -127,7 +127,7 @@ int SYMEXPORT alpm_db_update(int force, pmdb_t *db)
 		CALLOC(fileurl, len, sizeof(char), RET_ERR(PM_ERR_MEMORY, -1));
 		snprintf(fileurl, len, "%s/%s.db", server, db->treename);
 
-		ret = _alpm_download(fileurl, syncpath, force, 0, 0);
+		ret = _alpm_download(fileurl, syncpath, NULL, force, 0, 0);
 
 		if(ret == 0 && (check_sig == PM_PGP_VERIFY_ALWAYS ||
 					check_sig == PM_PGP_VERIFY_OPTIONAL)) {
@@ -135,7 +135,7 @@ int SYMEXPORT alpm_db_update(int force, pmdb_t *db)
 			/* if we downloaded a DB, we want the .sig from the same server */
 			snprintf(fileurl, len, "%s/%s.db.sig", server, db->treename);
 
-			sig_ret = _alpm_download(fileurl, syncpath, 1, 0, errors_ok);
+			sig_ret = _alpm_download(fileurl, syncpath, NULL, 1, 0, errors_ok);
 			/* errors_ok suppresses error messages, but not the return code */
 			sig_ret = errors_ok ? 0 : sig_ret;
 		}
diff --git a/lib/libalpm/dload.c b/lib/libalpm/dload.c
index 0ba3bc1..b37ca02 100644
--- a/lib/libalpm/dload.c
+++ b/lib/libalpm/dload.c
@@ -152,15 +152,45 @@ static int utimes_long(const char *path, long time)
 	return 0;
 }
 
+static size_t parse_headers(void *ptr, size_t size, size_t nmemb, void *user)
+{
+	size_t realsize = size * nmemb;
+	const char *fptr, *endptr = NULL;
+	const char * const cd_header = "Content-Disposition:";
+	const char * const fn_key = "filename=";
+	char **cd_filename = (char**)user;
+
+	if(strncasecmp(cd_header, ptr, strlen(cd_header)) == 0) {
+		if((fptr = strstr(ptr, fn_key))) {
+			fptr += strlen(fn_key);
+
+			/* find the end of the field, which is either a semi-colon, or the end of
+			 * the data. As per curl_easy_setopt(3), we cannot count on headers being
+			 * null terminated, so we look for the closing \r\n */
+			endptr = fptr + strcspn(fptr, ";\r\n") - 1;
+
+			/* remove quotes */
+			if(*fptr == '"' && *endptr == '"') {
+				fptr++;
+				endptr--;
+			}
+
+			STRNDUP(*cd_filename, fptr, endptr - fptr + 1, RET_ERR(PM_ERR_MEMORY, realsize));
+		}
+	}
+
+	return realsize;
+}
 
 static int curl_download_internal(const char *url, const char *localpath,
-		int force, int allow_resume, int errors_ok)
+		char **final_file, int force, int allow_resume, int errors_ok)
 {
-	int ret = -1;
+	int ret = -1, should_unlink = 0;
 	FILE *localf = NULL;
 	const char *useragent;
 	const char *open_mode = "wb";
-	char *destfile, *tempfile;
+	char *destfile = NULL, *tempfile = NULL, *effective_url;
+	char *cd_filename = NULL; /* filename from content-disposition header */
 	/* RFC1123 states applications should support this length */
 	char hostname[256];
 	char error_buffer[CURL_ERROR_SIZE];
@@ -177,10 +207,31 @@ static int curl_download_internal(const char *url, const char *localpath,
 		RET_ERR(PM_ERR_SERVER_BAD_URL, -1);
 	}
 
-	destfile = get_fullpath(localpath, dlfile.filename, "");
-	tempfile = get_fullpath(localpath, dlfile.filename, ".part");
-	if(!destfile || !tempfile) {
-		goto cleanup;
+	if(strlen(dlfile.filename) > 0 && strcmp(dlfile.filename, ".sig") != 0) {
+		destfile = get_fullpath(localpath, dlfile.filename, "");
+		tempfile = get_fullpath(localpath, dlfile.filename, ".part");
+		if(!destfile || !tempfile) {
+			goto cleanup;
+		}
+	} else { /* URL isn't to a file and ended with a slash */
+		int fd;
+		char randpath[PATH_MAX];
+
+		/* we can't support resuming this kind of download, so a partial transfer
+		 * will be destroyed */
+		should_unlink = 1;
+
+		/* create a random filename, which is opened with O_EXCL */
+		snprintf(randpath, PATH_MAX, "%salpmtmp.XXXXXX", localpath);
+		if((fd = mkstemp(randpath)) == -1 || !(localf = fdopen(fd, "w+"))) {
+			unlink(randpath);
+			close(fd);
+			_alpm_log(PM_LOG_ERROR, _("failed to create temporary file for download\n"));
+			goto cleanup;
+		}
+		/* localf now points to our alpmtmp.XXXXXX */
+		STRDUP(tempfile, randpath, RET_ERR(PM_ERR_MEMORY, -1));
+		dlfile.filename = strrchr(randpath, '/') + 1;
 	}
 
 	error_buffer[0] = '\0';
@@ -199,6 +250,8 @@ static int curl_download_internal(const char *url, const char *localpath,
 	curl_easy_setopt(handle->curl, CURLOPT_PROGRESSDATA, (void *)&dlfile);
 	curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_LIMIT, 1024L);
 	curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_TIME, 10L);
+	curl_easy_setopt(handle->curl, CURLOPT_HEADERFUNCTION, parse_headers);
+	curl_easy_setopt(handle->curl, CURLOPT_WRITEHEADER, &cd_filename);
 
 	useragent = getenv("HTTP_USER_AGENT");
 	if(useragent != NULL) {
@@ -217,9 +270,11 @@ static int curl_download_internal(const char *url, const char *localpath,
 		dlfile.initial_size = (double)st.st_size;
 	}
 
-	localf = fopen(tempfile, open_mode);
 	if(localf == NULL) {
-		goto cleanup;
+		localf = fopen(tempfile, open_mode);
+		if(localf == NULL) {
+			goto cleanup;
+		}
 	}
 
 	curl_easy_setopt(handle->curl, CURLOPT_WRITEDATA, localf);
@@ -266,6 +321,7 @@ static int curl_download_internal(const char *url, const char *localpath,
 	curl_easy_getinfo(handle->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &remote_size);
 	curl_easy_getinfo(handle->curl, CURLINFO_SIZE_DOWNLOAD, &bytes_dl);
 	curl_easy_getinfo(handle->curl, CURLINFO_CONDITION_UNMET, &timecond);
+	curl_easy_getinfo(handle->curl, CURLINFO_EFFECTIVE_URL, &effective_url);
 
 	/* time condition was met and we didn't download anything. we need to
 	 * clean up the 0 byte .part file that's left behind. */
@@ -286,6 +342,26 @@ static int curl_download_internal(const char *url, const char *localpath,
 		goto cleanup;
 	}
 
+	if(cd_filename) {
+		/* content-disposition header has a better name for our file */
+		free(destfile);
+		destfile = get_fullpath(localpath, cd_filename, "");
+	} else {
+		const char *effective_filename = strrchr(effective_url, '/');
+		if(effective_filename) {
+			effective_filename++;
+
+			/* if destfile was never set, we wrote to a tempfile. even if destfile is
+			 * set, we may have followed some redirects and the effective url may
+			 * have a better suggestion as to what to name our file. in either case,
+			 * refactor destfile to this newly derived name. */
+			if(!destfile || strcmp(effective_filename, strrchr(destfile, '/') + 1) != 0) {
+				free(destfile);
+				destfile = get_fullpath(localpath, effective_filename, "");
+			}
+		}
+	}
+
 	ret = 0;
 
 cleanup:
@@ -296,10 +372,18 @@ cleanup:
 
 	if(ret == 0) {
 		rename(tempfile, destfile);
+		if(final_file) {
+			*final_file = strdup(strrchr(destfile, '/') + 1);
+		}
+	}
+
+	if(dload_interrupted && should_unlink) {
+		unlink(tempfile);
 	}
 
 	FREE(tempfile);
 	FREE(destfile);
+	FREE(cd_filename);
 
 	/* restore the old signal handlers */
 	sigaction(SIGINT, &sig_int[OLD], NULL);
@@ -313,12 +397,13 @@ cleanup:
 }
 #endif
 
-int _alpm_download(const char *url, const char *localpath,
+int _alpm_download(const char *url, const char *localpath, char **final_file,
 		int force, int allow_resume, int errors_ok)
 {
 	if(handle->fetchcb == NULL) {
 #ifdef HAVE_LIBCURL
-		return curl_download_internal(url, localpath, force, allow_resume, errors_ok);
+		return curl_download_internal(url, localpath, final_file, force, allow_resume,
+				errors_ok);
 #else
 		RET_ERR(PM_ERR_EXTERNAL_DOWNLOAD, -1);
 #endif
@@ -335,16 +420,15 @@ int _alpm_download(const char *url, const char *localpath,
 char SYMEXPORT *alpm_fetch_pkgurl(pmhandle_t *handle, const char *url)
 {
 	char *filepath;
-	const char *filename, *cachedir;
+	const char *cachedir;
+	char *final_file = NULL;
 	int ret;
 
-	filename = get_filename(url);
-
 	/* find a valid cache dir to download to */
 	cachedir = _alpm_filecache_setup(handle);
 
 	/* download the file */
-	ret = _alpm_download(url, cachedir, 0, 1, 0);
+	ret = _alpm_download(url, cachedir, &final_file, 0, 1, 0);
 	if(ret == -1) {
 		_alpm_log(PM_LOG_WARNING, _("failed to download %s\n"), url);
 		return NULL;
@@ -362,7 +446,7 @@ char SYMEXPORT *alpm_fetch_pkgurl(pmhandle_t *handle, const char *url)
 		CALLOC(sig_url, len, sizeof(char), RET_ERR(PM_ERR_MEMORY, NULL));
 		snprintf(sig_url, len, "%s.sig", url);
 
-		ret = _alpm_download(sig_url, cachedir, 1, 0, errors_ok);
+		ret = _alpm_download(sig_url, cachedir, &final_file, 1, 0, errors_ok);
 		if(ret == -1 && !errors_ok) {
 			_alpm_log(PM_LOG_WARNING, _("failed to download %s\n"), sig_url);
 			/* Warn now, but don't return NULL. We will fail later during package
@@ -374,7 +458,9 @@ char SYMEXPORT *alpm_fetch_pkgurl(pmhandle_t *handle, const char *url)
 	}
 
 	/* we should be able to find the file the second time around */
-	filepath = _alpm_filecache_find(handle, filename);
+	filepath = _alpm_filecache_find(handle, final_file);
+	FREE(final_file);
+
 	return filepath;
 }
 
diff --git a/lib/libalpm/dload.h b/lib/libalpm/dload.h
index f4fd14c..c5d05b0 100644
--- a/lib/libalpm/dload.h
+++ b/lib/libalpm/dload.h
@@ -32,7 +32,7 @@ struct fileinfo {
 };
 
 int _alpm_download(const char *url, const char *localpath,
-		int force, int allow_resume, int errors_ok);
+		char **final_file, int force, int allow_resume, int errors_ok);
 
 #endif /* _ALPM_DLOAD_H */
 
diff --git a/lib/libalpm/sync.c b/lib/libalpm/sync.c
index 16be6d9..12a8c90 100644
--- a/lib/libalpm/sync.c
+++ b/lib/libalpm/sync.c
@@ -794,7 +794,7 @@ static int download_files(pmhandle_t *handle, alpm_list_t **deltas)
 					CALLOC(fileurl, len, sizeof(char), RET_ERR(PM_ERR_MEMORY, -1));
 					snprintf(fileurl, len, "%s/%s", server_url, filename);
 
-					ret = _alpm_download(fileurl, cachedir, 0, 1, 0);
+					ret = _alpm_download(fileurl, cachedir, NULL, 0, 1, 0);
 					FREE(fileurl);
 					if(ret != -1) {
 						break;
-- 
1.7.5.4



More information about the pacman-dev mailing list