[Dehs-devel] r196 - trunk

Raphael Geissert geissert at alioth.debian.org
Tue Jul 5 08:16:49 UTC 2011


Author: geissert
Date: 2011-07-05 08:16:49 +0000 (Tue, 05 Jul 2011)
New Revision: 196

Modified:
   trunk/dehs_pg.php
Log:
Rewrite the Sources parser with a proper one
Additionally rewrite that section to use prepared statements



Modified: trunk/dehs_pg.php
===================================================================
--- trunk/dehs_pg.php	2011-07-05 00:24:37 UTC (rev 195)
+++ trunk/dehs_pg.php	2011-07-05 08:16:49 UTC (rev 196)
@@ -4,7 +4,7 @@
 
 Originally written by Stefano Fabri <bluefuture at nospam@email.it>
 Copyright 2004, Stefano Fabri
-Copyright 2007, 2008 by Raphael Geissert <atomo64 at gmail.com>
+Copyright 2007, 2008, 2011 by Raphael Geissert <atomo64 at gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -66,6 +66,25 @@
     }
 }
 
+function file_with_wrapper($file) {
+    $wrapper = '';
+    if (preg_match('/\.(bz2|gz)$/D', $file, $matches)) {
+	$wrapper = 'compress.';
+	switch ($matches[1]) {
+	    case 'bz2':
+		$wrapper .= 'bzip2';
+		break;
+	    case 'gz':
+		$wrapper .= 'zlib';
+		break;
+	    default:
+		die('FIXME: '.__FILE__.':'.__LINE__);
+	}
+	$wrapper .= '://';
+    }
+    return $wrapper.$file;
+}
+
 function ext_watch ($file,$pkg,$version) {
     if (!file_exists($file)) return "";
 
@@ -159,138 +178,186 @@
     print "Download Diff.gz of package $pkg => Fallito\n";
     return false;
 }
-function db_add($initial='') {
-    global $dirs,$dists,$dbconn;
+
+function parse_sources($fh, $callback) {
+    $group = null;
+    $field = '';
+    $args = func_get_args();
+    unset($args[0]);
+    unset($args[1]);
+    // reindex the array:
+    $args = array_values($args);
+
+    while (!feof($fh)) {
+	if (!($line = fgets($fh)))
+	    die_status ('fgets failed at '.ftell($fh));
+
+	$line = trim($line);
+
+	if (!strlen($line)) {
+	    if ($group)
+		$callback($group, $args);
+	    $group = null;
+	}
+	else if (preg_match('/^(\S+):\s*(.*?)\s*$/', $line, $matches)) {
+	    if (!$group) {
+		$group = array();
+	    }
+	    $field = strtolower($matches[1]);
+	    $value = $matches[2];
+	    $group[$field] = $value;
+	} else if (preg_match('/^\s+(.*?)\s*$/', $line, $matches)) {
+	    if (!$group) {
+		print STDERR 'syntax error: found lone field continuation at '.ftell($fh)."\n";
+		continue;
+	    }
+	    $group[$field] .= "\n" . $matches[1];
+	} else {
+	    print STDERR 'syntax error: found lone data at '.ftell($fh)."\n";
+	}
+    }
+
+    if ($group) {
+	print STDERR 'syntax error: unterminated group, EOF reached'."\n";
+	$callback($group, $args);
+    }
+}
+
+function db_add_callback($group, $params) {
+    static $pkgs = array();
+
+    if ($group == null && $params == null) {
+	$old_pkgs = $pkgs;
+	$pkgs = array();
+	return $old_pkgs;
+    }
+
+    if (!isset($pkgs[$group['package']]) || is_updated($pkgs[$group['package']]['version'],$group['version'],true)) {
+	$group['section'] = $params[0];
+	$pkgs[$group['package']] = $group;
+    }
+}
+
+function db_add() {
+    global $dirs, $dists, $dbconn;
+
     check_db();
     download_sources();
 
     $db = pg_pconnect($dbconn)  or  die_status(pg_last_error($db));
-    pg_exec($db, "CREATE TEMP TABLE pkgs_atsrc (name text,dist text)") or die_status('Error creating temp table pkgs_atsrc');
-    pg_exec($db, "CREATE UNIQUE INDEX idxdis on pkgs_atsrc (name,dist)") or die_status('Error creating index on temp table');
-    pg_exec($db, "CREATE TEMP TABLE bin_atsrc (name text,bin_name text, dist text)") or die_status('Error creating temp table pkgs_atsrc');
-    pg_exec($db, "CREATE UNIQUE INDEX idxbis on bin_atsrc (name,bin_name,dist)") or die_status('Error creating index on temp table bin_atsrc');
 
+    pg_exec($db, "CREATE TEMP TABLE pkgs_atsrc (name text,dist text)")
+	or die_status('Error creating temp table pkgs_atsrc');
+    pg_exec($db, "CREATE UNIQUE INDEX idxdis on pkgs_atsrc (name,dist)")
+	or die_status('Error creating index on temp table');
+    pg_exec($db, "CREATE TEMP TABLE bin_atsrc (name text,bin_name text, dist text)")
+	or die_status('Error creating temp table pkgs_atsrc');
+    pg_exec($db, "CREATE UNIQUE INDEX idxbis on bin_atsrc (name,bin_name,dist)")
+	or die_status('Error creating index on temp table bin_atsrc');
+
+    $prepared_stmts = array(
+	'insert_pkgs_atsrc' => 'INSERT INTO pkgs_atsrc (name,dist) VALUES ($1,$2)',
+	'select_pkgs.version' => 'SELECT pkgs.version FROM pkgs WHERE name=$1 AND dist=$2',
+	'insert_pkgs' => 'INSERT INTO pkgs (name,version,dversionmangled,maint,dir,md5_atsource,bytes,dist,section,uploaders,homepage,vcs_browser,vcs_type,vcs,vcsoversource,lastpkgsourcesupdate)'
+			    .' VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,'."'0'".',now())',
+	'update_pkgs' => 'UPDATE pkgs SET version=$2, maint=$3, dir=$4, md5_atsource=$5,'
+			.'bytes=$6, section=$8, uploaders=$9, homepage=$10, vcs_browser=$11,'
+			.'vcs_type=$12, vcs=$13 WHERE name=$1 AND dist=$7',
+	'new_version' => 'UPDATE pkgs SET dversionmangled=$1, wwiz_type=NULL,'
+			.'lastpkgsourcesupdate=now() WHERE name=$2 AND dist=$3',
+	'insert_bin_atsrc' => 'INSERT INTO bin_atsrc (name,bin_name,dist) VALUES ($1, $2, $3)',
+	'insert_binpkgs' => 'INSERT INTO binpkgs (name,bin_name,dist) VALUES ($1, $2, $3)',
+    );
+
+    foreach($prepared_stmts as $name=>$stmt)
+	if (!pg_prepare($db, $name, $stmt))
+	    die_status ('Failed to prepare statement '.$stmt);
+    }
+
     foreach ($dists as $dist=>$sections) {
-        $pkgs = array();
-	$pkgsections = array();
         foreach ($sections as $section) {
             $filename=$dirs['sources_dir'] . "/$dist/$section/Sources.gz";
-            print "\nI'm parsing $dist/$section => Sources.gz\n";
+            print "\nParsing $dist/$section/Sources.gz\n";
 
-	    // Big regexp to parse a package entry
+            $fh = fopen(file_with_wrapper($filename), 'r')
+		or die_status("Could not open $filename");
 
-            // [1] Source package name
-            $regexp  = "/Package:\s($initial.*)[^a]";
-            // [2]  : Binary package name
-            $regexp .= "Binary:\s(.+)[^a]";
-            // [3]  : Epoch
-            // [4]  : Version
-            $regexp .= "Version:\s(\d+:)?(.+)[^a]";
-            $regexp .= "Priority.+";
-            // [5]  : Maintainer email address
-            $regexp .= "Maintainer:.+<([^>]+)>[^a]";
-            $regexp .= ".+";
-            // [6]  : Source package format
-            $regexp .= "Format:\s(.+)[^a]";
-            // [7]  : Package directory
-            $regexp .= "Directory:\s(.+)[^a]";
-            // [8]  : md5sum of the .diff.gz (or .debian.tar.(gz|bz2))
-            // [9]  : Size of the file in bytes
-            // [10] : Extension (.diff.gz or .debian.tar.gz)
-            $regexp .= "Files:.+\s(\S{32})\s(\d+)\s" . '\1\S+' . "(\.diff\.gz|\.debian\.tar\.(?:gz|bz2))[^a]";
-            // [11] : Uploaders field
-            // [12] : Uploaders value
-            $regexp .= "(Uploaders:\s(.+)[^a])?";
-            // [13] : DM-Upload-Allowed field
-            // [14] : DM-Upload-Allowed value
-            $regexp .= "(Dm-Upload-Allowed:\s(.+)[^a])?";
-            // [15] : Homepage field
-            // [16] : Homepage value
-            $regexp .= "(Homepage:\s(.+)[^a])?";
-            // [17] : Vcs-Browser field
-            // [18] : Vcs-Browser value
-            $regexp .= "(Vcs-Browser:\s(.+)[^a])?";
-            // [19] : Vcs-* field
-            // [20] :     vcs type
-            // [21] : Vcs-* value
-            $regexp .= "(Vcs-([A-Z][a-z]+):\s(.+)[^a])?";
-            // [21] : Checksums-* field
-            // [22] : Checksums-* value
-            $regexp .= "(Checksums-\S+:\s(.+)[^a])?";
-            $regexp .= "/Ssi";
-            $zp = gzopen($filename, "r") or die_status("Could not open $filename");
-            $extracted=tempnam("/tmp/", "$dist_$section_Sources");
-            exec("gzip -c -d $filename > $extracted");
-            $sourcesize=filesize($extracted);
-            unlink($extracted);
-            if(filesize($filename)>20) {
-                while (!gzeof($zp)) {
-                    $line='';
-                    while ($line!="\n")  {
-                        $line = gzgets ($zp,4096) ;
-                        $buff1 .= $line;
-                    }
-                    if (preg_match($regexp,$buff1,$matches))  {
-			$matches[4] = ((isset($matches[3]) && strlen($matches[3]))? $matches[3] : ''). $matches[4];
-                        if (!isset($pkgs[$matches[1]]) || is_updated($pkgs[$matches[1]][4],$matches[4],true)) {
-			    unset($matches[0]);
-                            $pkgs[$matches[1]] = $matches;
-			    $pkgsections[$matches[1]] = $section;
-			}
-                    }
-                    else {
-                        preg_match("/Package:\s(.+)[^a]Binary/im",$buff1,$matches);
-                        if(strncmp($initial,$matches[1],strlen($initial))<0) break;
-                    }
-                    $left=$sourcesize-gztell($zp);
-                    print "\rSource file Left => $left";
-                    $buff1="";
-                    unset($matches);
-                }
-            }
-            gzclose($zp) ;
+	    parse_sources($fh, 'db_add_callback', $section);
+
+            fclose($fh);
         }
-	foreach ($pkgs as $package=>$matches) {
-	    $section = $pkgsections[$matches[1]];
-            // Maintainer email address
-	    if ($matches[5]) {
-		$matches[5]=iconv("ISO-8859-1","UTF-8",$matches[5]);
-		$matches[5]=pg_escape_string($matches[5]);
+	$pkgs = db_add_callback(null, null);
+
+	foreach ($pkgs as $package=>$data) {
+
+	    $files = explode("\n", $data['files']);
+	    $md5 = null;
+	    foreach ($files as $file) {
+		if (!preg_match('/^(\S{32})\s(\d+)\s'.preg_quote($data['package'], '/').'\S+(\.diff\.gz|\.debian\.tar\.(?:gz|bz2))$/', $file, $matches))
+		    continue;
+		$data['file_md5'] = $matches[1];
+		$data['file_size'] = $matches[2];
 	    }
-            // Uploaders
-	    if ($matches[12]) {
-		$matches[12]=iconv("ISO-8859-1","UTF-8",$matches[12]);
-		$matches[12]=pg_escape_string($matches[12]);
+	    if (!$md5) {
+		print STDERR 'Could not determine md5 of '.$data['package']."\n";
+		continue;
 	    }
-	    @pg_exec($db,"INSERT INTO pkgs_atsrc (name,dist) VALUES ('$matches[1]','$dist')") OR die_status("Temp table pkgs_atsrc query error");
-	    $rst=@pg_exec($db, "INSERT INTO pkgs (name,version,dversionmangled,maint,dir,md5_atsource,bytes,dist,section,uploaders,homepage,vcs_browser,vcs_type,vcs,vcsoversource,lastpkgsourcesupdate) VALUES ('$matches[1]','$matches[4]','$matches[4]','$matches[5]','$matches[7]','$matches[8]','$matches[9]','$dist','$section','$matches[12]','$matches[16]','$matches[18]','$matches[20]','$matches[21]','0',now())");
+
+	    foreach ($data as $k=>$v) {
+		if (strpos($k, 'vcs-') !== 0)
+		    continue;
+		if ($k == 'vcs-browser')
+		    continue;
+		$data['vcs_type'] = strstr($k, '-', true);
+	    }
+
+	    $m = $data['maintainer'];
+	    $data['maintainer_email'] = substr($m, strpos($m, '<'), strpos($m, '>'));
+
+	    if ($data['maintainer_email'])
+		$data['maintainer_email'] = iconv('ISO-8859-1', 'UTF-8', $data['maintainer_email']);
+	    else
+		$data['maintainer_email'] = '';
+
+	    if ($data['uploaders'])
+		$data['uploaders'] = iconv('ISO-8859-1', 'UTF-8', $data['uploaders']);
+
+	    pg_execute($db, 'insert_pkgs_atsrc', array($data['package'], $dist));
+		or die_status("Temp table pkgs_atsrc query error");
+
+	    $rst=pg_execute($db, 'insert_pkgs', array($data['package'],$data['version'],$data['version'],$data['maintainer_email'],$data['directory'],$data['file_md5'],$data['file_size'],$dist,$data['section'],$data['uploaders'],$data['homepage'],$data['vcs-browser'],$data['vcs_type'],$data['vcs-'.$data['vcs_type']]));
 	    if (!$rst) {
-		$rsql=pg_exec($db, "SELECT pkgs.version FROM pkgs WHERE name='$matches[1]' AND dist='$dist';");
-		$version = $matches[4]; // just for safety
+		$rsql=pg_execute($db, 'select_pkgs.version', array($data['package'], $dist));
+		$version = $data['version']; // just for safety
 		while ($res_array=pg_fetch_array($rsql)) {
 		    $version = $res_array['version'];
 		}
-		$extra = '';
-		if ($version != $matches[4]) {
+
+		if ($version != $data['version']) {
 		    // only update the dversionmangled field if the versions differ
-		    $extra = ",dversionmangled='$matches[4]'";
-		    // also reset wwiz_type
-		    $extra = ",wwiz_type=NULL";
-		    // and update the lastpkgsourcesupdate field
-		    $extra = ",lastpkgsourcesupdate=now()";
+		    // in that case, also reset wwiz_type and update lastpkgsourcesupdate
+		    pg_execute($db, 'new_version', array($data['version'], $data['package'], $dist));
 		}
-		$rst=@pg_exec($db, "UPDATE pkgs SET name='$matches[1]',version='$matches[4]'$extra,maint='$matches[5]',dir='$matches[7]',md5_atsource='$matches[8]',bytes='$matches[9]',dist='$dist',section='$section',uploaders='$matches[12]',homepage='$matches[16]',vcs_browser='$matches[18]',vcs_type='$matches[20]',vcs='$matches[21]' WHERE name='$matches[1]' AND dist='$dist'") OR die_status("\nDb adding error =>" . pg_last_error() . "\n");
+
+		$rst=pg_execute($db, 'update_pkgs', array($data['package'], $data['version'], $data['maintainer_email'],
+					$data['directory'], $data['file_md5'], $data['file_size'], $dist,
+					$data['section'], $data['uploaders'], $data['homepage'], $data['vcs-browser'],
+					$data['vcs_type'], $data['vcs-'.$data['vcs_type']]))
+		    or die_status("\nDb adding error =>" . pg_last_error() . "\n");
 	    }
-	    $bin_names=split(",", $matches[2]);
+	    $bin_names = split(',', $data['binary']);
 	    foreach ($bin_names as $bin_name) {
-		pg_exec($db,"INSERT INTO bin_atsrc (name,bin_name,dist) VALUES ('$matches[1]','" . trim($bin_name) . "','$dist')") OR die_status("Temp table pkgs_atsrc query error");
-		$rst=@pg_exec($db, "INSERT INTO binpkgs (name,bin_name,dist) VALUES ('$matches[1]','" . trim($bin_name) . "','$dist')") ;
+		$bin_name = trim($bin_name);
+		pg_execute($db, 'insert_bin_atsrc', array($data['package'], $bin_name, $dist))
+		    or die_status("Temp table pkgs_atsrc query error");
+		pg_execute($db, 'insert_binpkgs', array($data['package'], $bin_name, $dist));
 	    }
 	}
     }
     clear_db($db);
     pg_close($db);
 }
+
 function db_query($pkg,$dist='%') {
     global $dirs,$dbconn;
     $db = pg_pconnect($dbconn )  or  die_status(pg_last_error($db));




More information about the Dehs-devel mailing list