[devscripts] 01/01: licensecheck: filter scanned files by mime type

dod at debian.org dod at debian.org
Tue Aug 18 11:26:33 UTC 2015


This is an automated email from the git hooks/post-receive script.

dod pushed a commit to branch master
in repository devscripts.

commit 3da4f5f9fb4fe95dee1f0b25a0614b69cb365c21
Author: Dominique Dumont <dod at debian.org>
Date:   Fri Jul 24 14:00:00 2015 +0200

    licensecheck: filter scanned files by mime type
    
    Currently, licensecheck -r uses find to scan a directory and accepts files
    based on their suffix (i.e. accepts .c .h .cxx ...)
    
    This list of suffixes is a big regexp that must be updated regularly. Even
    so, some files, such as config.guess, are still missed.
    
    Maintaining this regexp is not efficient.
    
    With this commit, licensecheck uses the 'file' command to decide whether
    to scan a file or not. All files of mime type 'text/*' and
    'application/xml' are scanned.
    
    Note that 'file' is already used to find the charset of each scanned file,
    so there's no performance impact.
---
 debian/changelog        |  4 ++++
 scripts/licensecheck.pl | 27 ++++++++++++++++++---------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 0ae137d..faa87a3 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -17,6 +17,10 @@ devscripts (2.15.9) UNRELEASED; urgency=medium
       #795470)
     + Avoid querying the BTS when there are no bugs closed in the changelog.
 
+  [ Dominique Dumont ]
+   * licensecheck: use 'file' command to decide whether to scan a file or
+     not (instead of testing file suffix)
+
  -- Joachim Breitner <nomeata at debian.org>  Tue, 11 Aug 2015 21:12:03 +0200
 
 devscripts (2.15.8) unstable; urgency=high
diff --git a/scripts/licensecheck.pl b/scripts/licensecheck.pl
index 78d9fd7..fd84db5 100755
--- a/scripts/licensecheck.pl
+++ b/scripts/licensecheck.pl
@@ -74,7 +74,8 @@ recursively.
 Specify a pattern against which filenames will be matched in order to
 decide which files to check the license of.
 
-The default includes common source files.
+By default, all files of mime type C<text/*> and C<application/xml>
+are parsed. The mime type is given by C<file> command.
 
 =item B<--copyright>
 
@@ -131,6 +132,16 @@ General Public License, version 2 or later.
 
 Adam D. Barratt <adam at adam-barratt.org.uk>
 
+=head1 SEE ALSO
+
+=over
+
+=item *
+
+L<file>
+
+=back
+
 =cut
 
 # see http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/6163129#6163129
@@ -169,9 +180,6 @@ my $default_ignore_regex = qr!
 \.shelf|_MTN|\.bzr(?:\.backup|tags)?)(?:$|/.*$)
 !x;
 
-my $default_check_regex = '\.(c(c|pp|xx)?|h(h|pp|xx)?|S|f(77|90)?|go|groovy|scala|clj|p(l|m)|xs|sh|php|py(|x)|rb|java|js|vala|el|sc(i|e)|cs|pas|inc|dtd|xsl|mod|m|tex|mli?|(c|l)?hs)$';
-
-
 # also used to cleanup
 my $copyright_indicator_regex
     = qr!
@@ -273,8 +281,9 @@ GetOptions(\%OPT,
 
 $OPT{'lines'} = $def_lines if $OPT{'lines'} !~ /^[1-9][0-9]*$/;
 my $ignore_regex = length($OPT{ignore}) ? qr/$OPT{ignore}/ : $default_ignore_regex;
-$OPT{'check'} = $default_check_regex if ! length $OPT{'check'};
-my $check_regex = qr/$OPT{check}/;
+
+my $check_regex ;
+$check_regex = qr/$OPT{check}/ if length $OPT{check};
 
 if ($OPT{'noconf'}) {
     fatal("--no-conf is only acceptable as the first command-line option!");
@@ -302,14 +311,14 @@ while (@ARGV) {
 
 	while (my $found = <$FIND>) {
 	    chomp ($found);
-	    next unless $found =~ $check_regex;
+	    next if ( $check_regex and $found !~ $check_regex );
 	    # Skip empty files
 	    next if (-z $found);
 	    push @files, $found unless $found =~ $ignore_regex;
 	}
 	close $FIND;
     } else {
-	next unless ($files_count == 1) or $file =~ $check_regex;
+	next unless ($files_count == 1) or ( $check_regex and $file =~ $check_regex);
 	push @files, $file unless $file =~ $ignore_regex;
     }
 }
@@ -466,7 +475,7 @@ Valid options are:
                             (Default: $def_lines)
    --check, -c            Specify a pattern indicating which files should
                              be checked
-                             (Default: '$default_check_regex')
+                             (Default: All text and xml files)
    --machine, -m          Display in a machine readable way (good for awk)
    --recursive, -r        Add the contents of directories recursively
    --copyright            Also display the file's copyright

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/collab-maint/devscripts.git



More information about the devscripts-devel mailing list