[pacman-dev] [PATCH 1/4] dload: handle irregular URLs

Dave Reisner d at falconindy.com
Fri Jul 1 08:59:23 EDT 2011


From: Dave Reisner <d at falconindy.com>

URLs might end with a slash and follow redirects, or could be
generated by a script such as /getpkg.php?id=12345. In both cases, we
may have a better filename that we can write to, taken from either
content-disposition header, or the effective URL.

Specific to the first case, we write to a temporary file of the format
'alpmtmp.XXXXXX', where XXXXXX is randomized by mkstemp(3). Since this
is a randomly generated file, we cannot support resuming and the file is
unlinked in the event of an interrupt.

We also run into the possibility of changing out the filename from under
alpm on a -U operation, so callers of _alpm_download can optionally pass
a pointer to a *char to be filled in by curl_download_internal with the
actual filename we wrote to. Any sync operation will pass a NULL pointer
here, as we rely on specific names for packages from a mirror.

Fixes FS#22645.

Signed-off-by: Dave Reisner <d at falconindy.com>
---
 lib/libalpm/be_sync.c |    4 +-
 lib/libalpm/dload.c   |  129 +++++++++++++++++++++++++++++++++++++++++--------
 lib/libalpm/dload.h   |    3 +-
 lib/libalpm/sync.c    |    2 +-
 4 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
index ce418d5..16351f8 100644
--- a/lib/libalpm/be_sync.c
+++ b/lib/libalpm/be_sync.c
@@ -184,7 +184,7 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
 		CALLOC(fileurl, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, -1));
 		snprintf(fileurl, len, "%s/%s.db", server, db->treename);
 
-		ret = _alpm_download(handle, fileurl, syncpath, force, 0, 0);
+		ret = _alpm_download(handle, fileurl, syncpath, NULL, force, 0, 0);
 
 		if(ret == 0 && (check_sig == PM_PGP_VERIFY_ALWAYS ||
 					check_sig == PM_PGP_VERIFY_OPTIONAL)) {
@@ -201,7 +201,7 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
 			/* if we downloaded a DB, we want the .sig from the same server */
 			snprintf(fileurl, len, "%s/%s.db.sig", server, db->treename);
 
-			sig_ret = _alpm_download(handle, fileurl, syncpath, 1, 0, errors_ok);
+			sig_ret = _alpm_download(handle, fileurl, syncpath, NULL, 1, 0, errors_ok);
 			/* errors_ok suppresses error messages, but not the return code */
 			sig_ret = errors_ok ? 0 : sig_ret;
 		}
diff --git a/lib/libalpm/dload.c b/lib/libalpm/dload.c
index fd83aac..eefa285 100644
--- a/lib/libalpm/dload.c
+++ b/lib/libalpm/dload.c
@@ -148,16 +148,46 @@ static int utimes_long(const char *path, long seconds)
 	return 0;
 }
 
+static size_t parse_headers(void *ptr, size_t size, size_t nmemb, void *user)
+{
+	size_t realsize = size * nmemb;
+	const char *fptr, *endptr = NULL;
+	const char * const cd_header = "Content-Disposition:";
+	const char * const fn_key = "filename=";
+	struct fileinfo **dlfile = (struct fileinfo**)user;
+
+	if(strncasecmp(cd_header, ptr, strlen(cd_header)) == 0) {
+		if((fptr = strstr(ptr, fn_key))) {
+			fptr += strlen(fn_key);
+
+			/* find the end of the field, which is either a semi-colon, or the end of
+			 * the data. As per curl_easy_setopt(3), we cannot count on headers being
+			 * null terminated, so we look for the closing \r\n */
+			endptr = fptr + strcspn(fptr, ";\r\n") - 1;
+
+			/* remove quotes */
+			if(*fptr == '"' && *endptr == '"') {
+				fptr++;
+				endptr--;
+			}
+
+			STRNDUP((*dlfile)->cd_filename, fptr, endptr - fptr + 1,
+					RET_ERR((*dlfile)->handle, PM_ERR_MEMORY, realsize));
+		}
+	}
 
-static int curl_download_internal(alpm_handle_t *handle,
-		const char *url, const char *localpath,
-		int force, int allow_resume, int errors_ok)
+	return realsize;
+}
+
+static int curl_download_internal(alpm_handle_t *handle, const char *url,
+		const char *localpath, char **final_file, int force, int allow_resume,
+		int errors_ok)
 {
-	int ret = -1;
+	int ret = -1, should_unlink = 0;
 	FILE *localf = NULL;
 	const char *useragent;
 	const char *open_mode = "wb";
-	char *destfile, *tempfile;
+	char *destfile = NULL, *tempfile = NULL, *effective_url;
 	/* RFC1123 states applications should support this length */
 	char hostname[256];
 	char error_buffer[CURL_ERROR_SIZE];
@@ -170,15 +200,39 @@ static int curl_download_internal(alpm_handle_t *handle,
 	dlfile.handle = handle;
 	dlfile.initial_size = 0.0;
 	dlfile.filename = get_filename(url);
+	dlfile.cd_filename = NULL;
 	if(!dlfile.filename || curl_gethost(url, hostname) != 0) {
 		_alpm_log(handle, PM_LOG_ERROR, _("url '%s' is invalid\n"), url);
 		RET_ERR(handle, PM_ERR_SERVER_BAD_URL, -1);
 	}
 
-	destfile = get_fullpath(localpath, dlfile.filename, "");
-	tempfile = get_fullpath(localpath, dlfile.filename, ".part");
-	if(!destfile || !tempfile) {
-		goto cleanup;
+	if(strlen(dlfile.filename) > 0 && strcmp(dlfile.filename, ".sig") != 0) {
+		destfile = get_fullpath(localpath, dlfile.filename, "");
+		tempfile = get_fullpath(localpath, dlfile.filename, ".part");
+		if(!destfile || !tempfile) {
+			goto cleanup;
+		}
+	} else {
+		/* URL isn't to a file and ended with a slash */
+		int fd;
+		char randpath[PATH_MAX];
+
+		/* we can't support resuming this kind of download, so a partial transfer
+		 * will be destroyed */
+		should_unlink = 1;
+
+		/* create a random filename, which is opened with O_EXCL */
+		snprintf(randpath, PATH_MAX, "%salpmtmp.XXXXXX", localpath);
+		if((fd = mkstemp(randpath)) == -1 || !(localf = fdopen(fd, open_mode))) {
+			unlink(randpath);
+			close(fd);
+			_alpm_log(handle, PM_LOG_ERROR,
+					_("failed to create temporary file for download\n"));
+			goto cleanup;
+		}
+		/* localf now points to our alpmtmp.XXXXXX */
+		STRDUP(tempfile, randpath, RET_ERR(handle, PM_ERR_MEMORY, -1));
+		dlfile.filename = strrchr(randpath, '/') + 1;
 	}
 
 	error_buffer[0] = '\0';
@@ -197,6 +251,8 @@ static int curl_download_internal(alpm_handle_t *handle,
 	curl_easy_setopt(handle->curl, CURLOPT_PROGRESSDATA, (void *)&dlfile);
 	curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_LIMIT, 1024L);
 	curl_easy_setopt(handle->curl, CURLOPT_LOW_SPEED_TIME, 10L);
+	curl_easy_setopt(handle->curl, CURLOPT_HEADERFUNCTION, parse_headers);
+	curl_easy_setopt(handle->curl, CURLOPT_WRITEHEADER, &dlfile);
 
 	useragent = getenv("HTTP_USER_AGENT");
 	if(useragent != NULL) {
@@ -215,9 +271,11 @@ static int curl_download_internal(alpm_handle_t *handle,
 		dlfile.initial_size = (double)st.st_size;
 	}
 
-	localf = fopen(tempfile, open_mode);
 	if(localf == NULL) {
-		goto cleanup;
+		localf = fopen(tempfile, open_mode);
+		if(localf == NULL) {
+			goto cleanup;
+		}
 	}
 
 	curl_easy_setopt(handle->curl, CURLOPT_WRITEDATA, localf);
@@ -264,6 +322,7 @@ static int curl_download_internal(alpm_handle_t *handle,
 	curl_easy_getinfo(handle->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &remote_size);
 	curl_easy_getinfo(handle->curl, CURLINFO_SIZE_DOWNLOAD, &bytes_dl);
 	curl_easy_getinfo(handle->curl, CURLINFO_CONDITION_UNMET, &timecond);
+	curl_easy_getinfo(handle->curl, CURLINFO_EFFECTIVE_URL, &effective_url);
 
 	/* time condition was met and we didn't download anything. we need to
 	 * clean up the 0 byte .part file that's left behind. */
@@ -284,6 +343,26 @@ static int curl_download_internal(alpm_handle_t *handle,
 		goto cleanup;
 	}
 
+	if(dlfile.cd_filename) {
+		/* content-disposition header has a better name for our file */
+		free(destfile);
+		destfile = get_fullpath(localpath, dlfile.cd_filename, "");
+	} else {
+		const char *effective_filename = strrchr(effective_url, '/');
+		if(effective_filename) {
+			effective_filename++;
+
+			/* if destfile was never set, we wrote to a tempfile. even if destfile is
+			 * set, we may have followed some redirects and the effective url may
+			 * have a better suggestion as to what to name our file. in either case,
+			 * refactor destfile to this newly derived name. */
+			if(!destfile || strcmp(effective_filename, strrchr(destfile, '/') + 1) != 0) {
+				free(destfile);
+				destfile = get_fullpath(localpath, effective_filename, "");
+			}
+		}
+	}
+
 	ret = 0;
 
 cleanup:
@@ -294,10 +373,18 @@ cleanup:
 
 	if(ret == 0) {
 		rename(tempfile, destfile);
+		if(final_file) {
+			*final_file = strdup(strrchr(destfile, '/') + 1);
+		}
+	}
+
+	if(dload_interrupted && should_unlink) {
+		unlink(tempfile);
 	}
 
 	FREE(tempfile);
 	FREE(destfile);
+	FREE(dlfile.cd_filename);
 
 	/* restore the old signal handlers */
 	sigaction(SIGINT, &sig_int[OLD], NULL);
@@ -316,18 +403,19 @@ cleanup:
  * @param handle the context handle
  * @param url the file's URL
  * @param localpath the directory to save the file in
+ * @param final_file the real name of the downloaded file (may be NULL)
  * @param force force download even if there is an up-to-date local copy
  * @param allow_resume allow a partial download to be resumed
  * @param errors_ok do not log errors (but still return them)
  * @return 0 on success, -1 on error (pm_errno is set accordingly if errors_ok == 0)
  */
 int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath,
-		int force, int allow_resume, int errors_ok)
+		char **final_file, int force, int allow_resume, int errors_ok)
 {
 	if(handle->fetchcb == NULL) {
 #ifdef HAVE_LIBCURL
-		return curl_download_internal(handle, url, localpath,
-				force, allow_resume, errors_ok);
+		return curl_download_internal(handle, url, localpath, final_file, force,
+				allow_resume, errors_ok);
 #else
 		RET_ERR(handle, PM_ERR_EXTERNAL_DOWNLOAD, -1);
 #endif
@@ -344,18 +432,17 @@ int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath
 char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
 {
 	char *filepath;
-	const char *filename, *cachedir;
+	const char *cachedir;
+	char *final_file = NULL;
 	int ret;
 
 	CHECK_HANDLE(handle, return NULL);
 
-	filename = get_filename(url);
-
 	/* find a valid cache dir to download to */
 	cachedir = _alpm_filecache_setup(handle);
 
 	/* download the file */
-	ret = _alpm_download(handle, url, cachedir, 0, 1, 0);
+	ret = _alpm_download(handle, url, cachedir, &final_file, 0, 1, 0);
 	if(ret == -1) {
 		_alpm_log(handle, PM_LOG_WARNING, _("failed to download %s\n"), url);
 		return NULL;
@@ -373,7 +460,7 @@ char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
 		CALLOC(sig_url, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, NULL));
 		snprintf(sig_url, len, "%s.sig", url);
 
-		ret = _alpm_download(handle, sig_url, cachedir, 1, 0, errors_ok);
+		ret = _alpm_download(handle, sig_url, cachedir, &final_file, 1, 0, errors_ok);
 		if(ret == -1 && !errors_ok) {
 			_alpm_log(handle, PM_LOG_WARNING, _("failed to download %s\n"), sig_url);
 			/* Warn now, but don't return NULL. We will fail later during package
@@ -385,7 +472,9 @@ char SYMEXPORT *alpm_fetch_pkgurl(alpm_handle_t *handle, const char *url)
 	}
 
 	/* we should be able to find the file the second time around */
-	filepath = _alpm_filecache_find(handle, filename);
+	filepath = _alpm_filecache_find(handle, final_file);
+	FREE(final_file);
+
 	return filepath;
 }
 
diff --git a/lib/libalpm/dload.h b/lib/libalpm/dload.h
index 0cdd900..351b2b3 100644
--- a/lib/libalpm/dload.h
+++ b/lib/libalpm/dload.h
@@ -29,11 +29,12 @@
 struct fileinfo {
 	alpm_handle_t *handle;
 	const char *filename;
+	char *cd_filename;
 	double initial_size;
 };
 
 int _alpm_download(alpm_handle_t *handle, const char *url, const char *localpath,
-		int force, int allow_resume, int errors_ok);
+		char **final_file, int force, int allow_resume, int errors_ok);
 
 #endif /* _ALPM_DLOAD_H */
 
diff --git a/lib/libalpm/sync.c b/lib/libalpm/sync.c
index 8f03840..8df8a17 100644
--- a/lib/libalpm/sync.c
+++ b/lib/libalpm/sync.c
@@ -797,7 +797,7 @@ static int download_files(alpm_handle_t *handle, alpm_list_t **deltas)
 					CALLOC(fileurl, len, sizeof(char), RET_ERR(handle, PM_ERR_MEMORY, -1));
 					snprintf(fileurl, len, "%s/%s", server_url, filename);
 
-					ret = _alpm_download(handle, fileurl, cachedir, 0, 1, 0);
+					ret = _alpm_download(handle, fileurl, cachedir, NULL, 0, 1, 0);
 					FREE(fileurl);
 					if(ret != -1) {
 						break;
-- 
1.7.6



More information about the pacman-dev mailing list