[dutch-svn] r131 - trunk

Kurt Roeckx kroeckx at alioth.debian.org
Sat Jun 27 11:59:55 UTC 2009


Author: kroeckx
Date: 2009-06-27 11:59:54 +0000 (Sat, 27 Jun 2009)
New Revision: 131

Modified:
   trunk/convert
Log:
Update convert script to add the keepcase flag for hunspell.


Modified: trunk/convert
===================================================================
--- trunk/convert	2009-06-26 14:07:37 UTC (rev 130)
+++ trunk/convert	2009-06-27 11:59:54 UTC (rev 131)
@@ -4,6 +4,8 @@
 # Input files:
 UTF8FILES="basiswoorden_OpenTaal_110G.txt flexievormen_OpenTaal_110G.txt"
 LATIN1FILES=""
+KEEP_CASE="keepcaseOnderkastOfBovenEnOnderkast.txt"
+KEEP_CASE_ONLY_LOWER="keepcaseAltijdOnderkast.txt"
 
 # aff file to use
 # myspell
@@ -44,6 +46,12 @@
 # - ispell can't handle the same as aspell, but it generates the warnings
 #   when building the package instead of when installing it.
 
+# For hunspell we can also tell that some words should always
+# be written in lowercase.  There are 2 cases:
+# - abbreviations like dvd.  DVD is always wrong, Dvd is the correct way
+#   to write it at the start of a sentence, otherwise it should be dvd.
+# - words that should always be in lower case.
+
 # Temporary files:
 # Contains the full list of correct words.  Nothing seems to be able to
 # support utf8, but it's generated anyway.  The latin1 version
@@ -103,6 +111,22 @@
 # myspell/hunspell output
 cp $INPUT_AFF $MYSEPLL_AFF
 munch $LATIN1FIXEDMUNCH $MYSEPLL_AFF > $LATIN1MYSPELL
+cp $LATIN1MYSPELL myspell/nl.dic.org
+# We need to modify the generated munched file, start by
+# removing the first line that contains the number of lines in the file.
+sed -e '1d' $LATIN1MYSPELL > tmp.txt
+# Words have a ; before and after the word, 1 word per line.
+for w in `sed -n -e 's/;\(.*\);.*/\1/p' $KEEP_CASE $KEEP_CASE_ONLY_LOWER`; do
+   # replace lines with the word and add c to the flags
+   sed -i -e "s/^$w\//$w\/c/" tmp.txt;
+   sed -i -e "s/^$w\$/$w\/c/" tmp.txt;
+done
+for w in `sed -n -e 's/;\(.*\);.*/\1/p' $KEEP_CASE`; do
+   # Convert first char to uppercase, add a /c and append to list
+   echo $w | sed -e 's/\(.*\)/\u\1\/c/' >> tmp.txt;
+done
+wc -l < tmp.txt > $LATIN1MYSPELL
+cat tmp.txt >> $LATIN1MYSPELL
 
 # ispell
 munchlist -l $ISPELL_AFF -v -w "&'\`-_" $LATIN1FIXEDLIST > $LATIN1ISPELL




More information about the pkg-dutch-commit mailing list