[Debian-tex-commits] SVN tex-common commit + diffs: r2587 - in tex-common/trunk: debian doc

Sat Mar 17 11:30:52 CET 2007

Author: frn
Date: 2007-03-17 10:30:51 +0000 (Sat, 17 Mar 2007)
New Revision: 2587

Modified:
   tex-common/trunk/debian/changelog
   tex-common/trunk/doc/texify-tex-output
Log:
Changes to texify-tex-output:

* The document preamble is now easily customizable with a list of
  (regexp, replacement) tuples.

* More documentation (in particular, new option --algorithm explains in detail
  what the script does).

* Use '/usr/bin/python' instead of '/usr/bin/env python' (not really
  necessary, though, since the script is not even called at package build
  time--cf. changelog).

* Add the LastChangedRevision property, so that --version actually reports
  something accurate.

Update the Debian changelog.


Modified: tex-common/trunk/debian/changelog
===================================================================

--- tex-common/trunk/debian/changelog	2007-03-16 16:51:52 UTC (rev 2586)
+++ tex-common/trunk/debian/changelog	2007-03-17 10:30:51 UTC (rev 2587)
@@ -1,7 +1,16 @@
 tex-common (1.3~1) unreleased; urgency=low
 
   * Fix typography in Debian-on-TeX, thanks to Miguel de Val Borro
-    <miguel.deval at gmail.com> (closes: #413449)
+    <miguel.deval at gmail.com> (closes: #413449) [frank]
+  
+  * In the source package, replace 'tex-sed' with a Python script named
+    'texify-tex-output' to do a better job (handling all known cases so
+    far, some of which seemed rather difficult to implement in sed).
+  
+    We don't need to Build-Depend on Python, because this script is only
+    used when we generate PDF output from the DebianDoc documents
+    (Debian-TeX-Policy, TeX-on-Debian), which we don't do at build time
+    in order to avoid chicken-and-egg problems. [florent]
 
  -- Frank Küster <frank at debian.org>  Wed, 14 Mar 2007 16:10:53 +0100
 

Modified: tex-common/trunk/doc/texify-tex-output
===================================================================
--- tex-common/trunk/doc/texify-tex-output	2007-03-16 16:51:52 UTC (rev 2586)
+++ tex-common/trunk/doc/texify-tex-output	2007-03-17 10:30:51 UTC (rev 2587)
@@ -1,6 +1,6 @@
-#! /usr/bin/env python
-
-# texify-tex-output --- Change TeX into \TeX and similar stuff
+#! /usr/bin/python
+#
+# texify-tex-output --- Enhance the LaTeX output of DebianDoc tools
 # Copyright (c) 2007 Florent Rougon
 #
 # This program is free software; you can redistribute it and/or modify
@@ -21,46 +21,80 @@
 
 import sys, os, re, getopt
 
-# List of (regexp, replacement text) tuples describing the substitutions to
-# perform.
+# Call with option --algorithm for a precise explanation of how the script
+# works (or look for the definition of 'algorithm_doc' below).
 #
+#
+# ****************************************************************************
+# *                     Simple customization starts here                     *
+# ****************************************************************************
+
+# Document preamble
+# ~~~~~~~~~~~~~~~~~
+# List of (regexp, replacement text) tuples used to customize the preamble.
+#
 # For the regexp syntax, see:
 #
 #   file:///usr/share/doc/python2.5-doc/html/lib/re-syntax.html
 #
 # The replacement text is expanded by match_object.expand(), therefore you can
-# reference groups from the matching regexp with backreferences such as
-# \1, \2, etc. (even by group name). Similarly, escape sequences such as \n
-# are processed in the replacement text, therefore we have to use two
-# backslashes there to insert one backslash, even when using raw strings
-# (cf. file:///usr/share/doc/python2.5-doc/html/lib/match-objects.html).
-substitutions = [(r"\bTeX\b", r"\\TeX{}"),
-                 (r"\bpdfTeX\b", r"pdf\\TeX{}"),
-                 (r"\bMetafont\b", r"\\MF{}"),
-                 (r"\bLaTeX\b", r"\\LaTeX{}"),
-                 (r"\bConTeXt\b", r"Con\\TeX{}t"),
-                 (r"\bteTeX\b", r"te\\TeX{}"),
-                 (r"\bMiKTeX\b", r"MiK\\TeX{}")]
+# reference groups from the matching regexp with backreferences such as \1,
+# \2, etc. (even by group name with the \g<name> syntax). Similarly, escape
+# sequences such as \n are processed in the replacement text, therefore we
+# have to use two backslashes there to insert one backslash, even when using
+# raw strings (cf.
+# file:///usr/share/doc/python2.5-doc/html/lib/match-objects.html).
+#
+# For a substitution to happen on a given line, the regexp must match at
+# the beginning of that line. The replacement text can generate several lines
+# if needed, include the original line (using a group), etc.
+preamble_substitutions = [
+    (r"^(?P<input_line>\s*\\usepackage(\[[^][]*\])?\{fontenc\}.*)$",
+     r"\g<input_line>\n\\usepackage{mflogo}\n")]
 
+# How to recognize the end of the preamble
+begin_doc_re = r"^\s*\\begin\{document\}"
+
+# Document body
+# ~~~~~~~~~~~~~
+# List of (regexp, replacement text) tuples describing which substitutions are
+# to be performed on each chunk of the body text.
+#
+# The same comments as for 'preamble_substitutions' apply here (in particular,
+# a replacement text can make use of groups from the corresponding regexp),
+# except that here, we don't check if a given regexp *matches* at the beginning
+# of a chunk; instead, we *search* (using regexp.search()) for the first match
+# of that regexp in the chunk.
+body_substitutions = [(r"\bTeX\b", r"\\TeX{}"),
+                      (r"\bpdfTeX\b", r"pdf\\TeX{}"),
+                      (r"\bMetafont\b", r"\\MF{}"),
+                      (r"\bLaTeX\b", r"\\LaTeX{}"),
+                      (r"\bConTeXt\b", r"Con\\TeX{}t"),
+                      (r"\bteTeX\b", r"te\\TeX{}"),
+                      (r"\bMiKTeX\b", r"MiK\\TeX{}")]
+
 # List of (command_name, number_of_args) tuples for LaTeX commands which
 # should not be subject to the regexp substitution (neither the command name,
 # nor its arguments).
 #
-# If DebianDoc starts using an \envvar command for typesetting the names
-# of environment variables (that would be nice), it should be added to this
-# list.
+# If the LaTeX output from DebianDoc tools starts using an \envvar command for
+# typesetting the names of environment variables (that would be nice), it
+# should be added to this list.
 skipped_commands = [("file", 1)]
 
 # When processing the body of the document, if a line matches one of the
 # regexps in 'no_subst', it will not be subject to substitution at all.
 no_subst = [r"^.*\bgenerated from \\\$Id:[ \t]+.+[ \t]+\\\$"]
 
+# ****************************************************************************
+# *                      Simple customization ends here                      *
+# ****************************************************************************
 
 progname = os.path.basename(sys.argv[0])
-progversion_base = "0.1"
+progversion_base = "0.2"
 
 # Append an SVN revision part to the program version
-svn_revision_string = "$LastChangedRevision: 2510 $"
+svn_revision_string = "$LastChangedRevision$"
 svn_revision_rec = re.compile(r"^\$LastChangedRevision: ([0-9]+) \$$")
 svn_revision_mo = svn_revision_rec.match(svn_revision_string)
 
@@ -74,14 +108,107 @@
 
 
 usage = """Usage: %(progname)s [option ...] input_file output_file
-Filter DebianDoc LaTeX's output to translate TeX into \TeX, etc.
+Enhance the LaTeX output of DebianDoc tools.
 
+The document preamble is customized; in the document body, TeX is replaced
+with \\TeX{}, LaTeX with \\LaTeX{}, etc., except where it doesn't make sense
+(as in the argument of \\file, and in the SVN Id).
+
+The calling syntax allows this script to be specified as the argument to the
+-s option of commands such as debiandoc2latexpdf.
+
 Options:
+      --algorithm              explain the algorithm used
       --help                   display this message and exit
       --version                output version information and exit""" \
   % {"progname": progname}
 
 
+algorithm_doc = """\
+The algorithm used in %(progname)s is the following. First, read the
+preamble and customize it (e.g., to add '\\usepackage{mflogo}' after
+the line loading 'fontenc'). This is done the following way:
+
+  Each line of the preamble is read separately. If it matches one of the
+  regular expressions in 'preamble_substitutions', the corresponding
+  replacement text is substituted, and no other substitution is done on that
+  line (i.e., the first regexp that matches 'wins').
+
+  Lines that don't match any regexp in 'preamble_substitutions' are output
+  verbatim.
+
+  Note: 'preamble_substitutions' is a list of tuples; the first element of
+        each tuple is a regexp, and the second element is its corresponding
+        replacement text.
+
+  The end of the preamble is detected when a line matches the regexp
+  in 'begin_doc_re'.
+
+Then, process the body of the document line by line. If a line matches at
+least one of the regular expressions in 'no_subst', it is dumped verbatim.
+Currently, this is used to avoid changing 'Debian-TeX-Policy.sgml' into
+'Debian-\\TeX{}-Policy.sgml' in the Id generated by subversion:
+
+  $Id$
+
+Other lines in the document body go through the following filter:
+
+  1. Split the line into chunks separated by LaTeX commands listed in
+     'skipped_commands' (currently, only \\file).
+
+  2. Such commands and their arguments are dumped verbatim. This avoids
+     mangling file names that contain the string 'TeX', such as
+     '/etc/texmf/texmf.d/05TeXMF.cnf'.
+
+     For each of these commands, the number of arguments is supposed to be
+     fixed, as specified in %(progname)s. But it is possible to specify
+     that e.g., \\file takes one argument, and \\othercommand takes two
+     arguments.
+
+  3. The remaining chunks of text each go through the substitution process,
+     which works as follows:
+
+     Initialisation:
+
+       index = 0  --- which means, start at the beginning of the chunk
+
+     Loop:
+
+     (a) Look for the first match of each regular expression in
+         'body_substitutions' (the regexp is the first element of each tuple),
+         starting at 'index' in the chunk. If no regexp matches, it means
+         there is nothing left to replace in the chunk, therefore we break the
+         loop.
+
+     (b) Choose the regexp that matched earliest in the chunk, dump the text
+         from 'index' to the beginning of the regexp match, and write the
+         replacement text for the regexp (which is given in the second element
+         of the tuple in 'body_substitutions' that contains the regexp, and
+         can make use of groups that matched in the regexp---specified as \\1,
+         \\2, or even by group name).
+
+     (c) Let 'index' point right after the end of the regexp match and start a
+         new loop iteration, provided 'index' doesn't point to the end of the
+         chunk yet.
+
+     The idea behind this loop is to proceed as a human would do, instead of
+     the simpler way, which would be: successively replace all occurrences of
+     each regexp in 'body_substitutions' in the chunk. This simpler way would
+     cause problems, because a replacement text for a given regexp could be
+     later matched by another regexp, and be subject to a second (recursive)
+     replacement, which is generally not wanted and forces one to be very
+     careful about the order in which the regexps are listed.
+
+Currently, the arguments of LaTeX commands in step 2 are supposed to all fit
+on the same line as the command, and they are recognized based on brace
+matching, with escaped braces \\{ and \\} properly handled (they are not
+confused with braces which delimit arguments). Due to this single-line
+limitation, the arguments cannot contain TeX comments. Currently, this is
+sufficient for the \\file commmand calls produced by DebianDoc, which is why
+this relatively simple design was chosen.""" \
+% {"progname": progname}
+
+
 class error(Exception):
     pass
 
@@ -92,15 +219,29 @@
     "Exception raised for obvious bugs (when an assertion is false)."
 
 
-def process_preamble(input_stream, output_stream, lineno):
-    fontenc_rec = re.compile(r"^\s*\\usepackage(\[[^][]*\])?\{fontenc\}")
-    begin_doc_rec = re.compile(r"^\s*\\begin\{document\}")
+def compile_regexps(seq):
+    res = []
 
+    for e in seq:
+        res.append((re.compile(e[0]), e[1]))
+
+    return res
+
+
+def process_preamble(input_stream, output_stream, preamble_substitutions,
+                     begin_doc_re, lineno):
+    begin_doc_rec = re.compile(begin_doc_re)
+    subs = compile_regexps(preamble_substitutions)
+
     for line in input_stream:
         obuf = [line]                   # output buffer
         
-        if fontenc_rec.match(line):
-            obuf.append("\\usepackage{mflogo}\n")
+        for regexp, repl in subs:
+            mo = regexp.match(line)
+            if mo is not None:
+                # Replacement text
+                obuf = [mo.expand(repl)]
+                break
 
         output_stream.write(''.join(obuf))
         lineno += 1
@@ -116,7 +257,9 @@
 
     If all elements are equal to -1, return None."""
 
+    # Smallest number found so far, among those that are different from -1
     min_so_far = None
+    # Index of this number in 'l'
     index_of_min_so_far = None
 
     for i in range(len(l)):
@@ -129,6 +272,12 @@
 
 
 def skip_cmd_and_args(obuf, line, cmd_start, command, nargs, lineno):
+    """Skip a LaTeX command and its arguments.
+
+    The command 'command' is supposed to start at position 'cmd_start' in
+    'line', and accept 'nargs' mandatory arguments.
+
+    """
     start_of_cmd_call_rec = re.compile(r"\\%s[ \t]*\{" % command)
 
     mo = start_of_cmd_call_rec.match(line, pos=cmd_start)
@@ -275,7 +424,8 @@
 def process_command_line():
     try:
         opts, args = getopt.getopt(sys.argv[1:], "",
-                                   ["help",
+                                   ["algorithm",
+                                    "help",
                                     "version"])
     except getopt.GetoptError, message:
         sys.stderr.write(usage + "\n")
@@ -284,7 +434,10 @@
     params = {}
 
     for option, value in opts:
-        if option == "--help":
+        if option == "--algorithm":
+            print algorithm_doc
+            return ("exit", 0)
+        elif option == "--help":
             print usage
             return ("exit", 0)
         elif option == "--version":
@@ -304,15 +457,6 @@
     return ("continue", params)
 
 
-def compile_regexps(seq):
-    res = []
-
-    for e in seq:
-        res.append((re.compile(e[0]), e[1]))
-
-    return res
-
-
 def main():
     action, p = process_command_line()
     if action == "exit":
@@ -324,8 +468,10 @@
     # Number of the input line that will be read next, starting from 1
     lineno = 1
     
-    lineno = process_preamble(input_stream, output_stream, lineno)
-    process_body(input_stream, output_stream, substitutions, no_subst,
+    lineno = process_preamble(input_stream, output_stream,
+                              preamble_substitutions, begin_doc_re,
+                              lineno)
+    process_body(input_stream, output_stream, body_substitutions, no_subst,
                  lineno)
 
     sys.exit(0)


Property changes on: tex-common/trunk/doc/texify-tex-output
___________________________________________________________________
Name: svn:keywords
   - Id
   + Id LastChangedRevision