[Simple-cdd-devel] Bug#861198: Shutting down public FTP services

Vagrant Cascadian vagrant at debian.org
Thu Apr 27 15:26:13 UTC 2017


Any review, refactoring, etc. would be great!

I want to keep an eye toward a patch that would be acceptable to the
release team for stretch at this point, so maybe only refactoring this
new code for a smaller, more readable diff would be best.


On 2017-04-25, Enrico Zini wrote:
> simple-cdd currently depends on ftp to be able to get files in
> http://ftp.de.debian.org/debian/doc/ that are needed by debian-cd. It
> needs ftp because with http there is no reliable way to enumerate the
> list of files in doc/ to be downloaded. 

Solved by the earlier patch, and pushed that to git.


> That is the only reason there's still a need of the mirror_wget tool
> and all the download can't just be done via python's http client
> libraries.

Below is a crude implementation using python3-urllib3. Maybe it's good
enough.

I'm not entirely sure the timestamp checking is effective; it may be
downloading the whole file on the header request. I'm particularly
worried about whether the date string handling and timezone madness are
good enough to reasonably handle Debian mirrors in the wild...

It also supports using a proxy, although it seems to require a bit of
code duplication. More elegant support for proxies would be nice.

Ironically, there are no remaining calls to wget in mirror_wget.py.
Probably should deprecate and/or remove wget_debian_mirror and other
variables that reference wget...


diff --git a/simple_cdd/tools/mirror_wget.py b/simple_cdd/tools/mirror_wget.py
index 5ccbbca..06e6537 100644
--- a/simple_cdd/tools/mirror_wget.py
+++ b/simple_cdd/tools/mirror_wget.py
@@ -6,6 +6,9 @@ from urllib.parse import urlparse, urljoin
 import os
 import re
 import logging
+import urllib3
+import time
+from dateutil import parser
 
 log = logging.getLogger()
 
@@ -35,25 +38,43 @@ class ToolMirrorWget(Tool):
             baseurl = env.get("wget_debian_mirror")
             path_depth = urlparse(baseurl).path.strip("/").count("/") + 1
 
-            def _wget_one(url, output):
+            def _download(url, output):
                 if not os.path.isdir(os.path.dirname(output)):
                     os.makedirs(os.path.dirname(output))
-                args = ["wget", "--output-document="+output, "--timestamping", url]
-                retval = run_command("wget {}".format(url), args, logfd=logfd, env=wget_env)
-                if retval != 0:
-                    raise Fail("wget exited with code %s, see %s for full output log", retval, logfilename)
 
-            # Build the environment for running reprepro
-            wget_env = {}
-            for name, val, changed in self.env.export_iter():
-                wget_env[name] = str(val)
+                r = http.request('GET', url)
+                url_modified_string = r.getheader('Last-Modified')
+                url_modified = parser.parse(url_modified_string)
+                log.info(url + 'Last Modified' + url_modified_string)
+                if os.path.exists(output):
+                    file_modified = os.path.getmtime(output)
+                    log.info(output + 'Last Modified' + time.ctime(file_modified))
+                    if file_modified > url_modified.timestamp():
+                        log.info('newer file %s than url %s, skipping download', output, url)
+                        return
+                y = open(output, 'wb')
+                y.write(r.data)
+                y.close()
+                os.utime(output, times=(time.time(), url_modified.timestamp()))
+
+            if env.get("http_proxy"):
+                http = urllib3.ProxyManager(
+                    env.get("http_proxy"),
+                    cert_reqs='CERT_REQUIRED',
+                    ca_certs='/etc/ssl/certs/ca-certificates.crt'
+                )
+            else:
+                http = urllib3.PoolManager(
+                    cert_reqs='CERT_REQUIRED',
+                    ca_certs='/etc/ssl/certs/ca-certificates.crt'
+                )
 
             if env.get("mirror_files"):
                 # Download the checksums present in the archive "extrafiles" and verify
                 extrafiles_file_inlinesig = os.path.join(env.get("MIRROR"), "extrafiles")
                 extrafiles_file= os.path.join(env.get("simple_cdd_temp"), "extrafiles.unsigned")
                 download_extrafiles_file = os.path.join(env.get("wget_debian_mirror"), "extrafiles")
-                _wget_one(download_extrafiles_file, extrafiles_file_inlinesig)
+                _download(download_extrafiles_file, extrafiles_file_inlinesig)
                 self.gnupg.verify_inline_sig(extrafiles_file_inlinesig)
                 self.gnupg.extract_inline_contents(extrafiles_file, extrafiles_file_inlinesig)
 
@@ -81,7 +102,7 @@ class ToolMirrorWget(Tool):
                         })
 
                 for x in ef_files:
-                    _wget_one(x["url"], x["absname"])
+                    _download(x["url"], x["absname"])
                     extrafile_sums.verify_file(x["absname"], x["relname"])
 
 
@@ -94,14 +115,14 @@ class ToolMirrorWget(Tool):
             for x in files:
                 p = os.path.join(env.get("MIRROR"), x)
                 d = os.path.join(env.get("wget_debian_mirror"), x)
-                _wget_one(d, p)
+                _download(d, p)
 
             if checksum_files:
                 # Get the release file and verify that it is valid
                 release_file = os.path.join(env.get("simple_cdd_temp"), env.format("{DI_CODENAME}_Release"))
                 download_release_file = os.path.join(env.get("wget_debian_mirror"), "dists", env.get("DI_CODENAME"), "Release")
-                _wget_one(download_release_file, release_file)
-                _wget_one(download_release_file + ".gpg", release_file + ".gpg")
+                _download(download_release_file, release_file)
+                _download(download_release_file + ".gpg", release_file + ".gpg")
                 self.gnupg.verify_detached_sig(release_file, release_file + ".gpg")
 
                 # Parse the release file for checksums
@@ -147,7 +168,7 @@ class ToolMirrorWget(Tool):
                     file_sums.parse_checksums_file(absname, hashtype)
                     for f in extra_files:
                         # Download the extra files
-                        _wget_one(f["url"], f["absname"])
+                        _download(f["url"], f["absname"])
                         file_sums.verify_file(f["absname"], f["relname"])
 
 


live well,
  vagrant
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <http://lists.alioth.debian.org/pipermail/simple-cdd-devel/attachments/20170427/22c24871/attachment.sig>


More information about the Simple-cdd-devel mailing list