[Pkg-isocodes-devel] [SCM] ISO language, territory, currency, script codes and their translations branch, master, updated. ba6f0b662aca3fd2ea0d902813b0a39708d5521c
Tobias Quathamer
toddy at debian.org
Thu Jun 13 13:19:04 UTC 2013
The following commit has been merged in the master branch:
commit 4b206c6ccc6d68d097f0aa09e79ba8d1ba6eb394
Author: Tobias Quathamer <toddy at debian.org>
Date: Thu Jun 13 15:12:34 2013 +0200
Rewrite parsing script for new SIL tabular file
diff --git a/iso_639_3/iso-dis-639-tab-parse.py b/iso_639_3/iso-dis-639-tab-parse.py
index 1e6af57..b159e10 100755
--- a/iso_639_3/iso-dis-639-tab-parse.py
+++ b/iso_639_3/iso-dis-639-tab-parse.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# Copyright © 2005 Alastair McKinstry <mckinstry at computer.org>
-# Copyright © 2008,2012 Tobias Quathamer <toddy at debian.org>
+# Copyright © 2008,2012,2013 Tobias Quathamer <toddy at debian.org>
#
# This file is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -23,12 +23,11 @@ Parse the SIL.org iso_639_3.tab file and create
an XML file for our own use.
"""
-# The Name_Index file has the same fields as the normal
-# tab file. Extract the additional entries and save them
-# for later.
-alternative_names = {}
-last_code = ''
-last_name = ''
+# The Name_Index file only has the fields
+# Id, Print_Name, and Inverted_Name.
+# There may be multiple lines with the same Id.
+# Extract only the inverted names which differ.
+inverted_names = {}
names = open("iso_639_3_Name_Index.tab")
for li in names.readlines():
# Split the line into parts
@@ -36,49 +35,21 @@ for li in names.readlines():
# Reverse the parts, because Python's pop() function is much
# faster at the end of a list instead of at the start of a list
parts.reverse()
- # Get the code
+ # Get the fields
code = parts.pop()
- # Take away unneeded parts
- status = parts.pop()
- partner_agency = parts.pop()
- iso_639_3 = parts.pop()
- iso_639_2 = parts.pop()
- b_code = parts.pop()
- bt_equiv = parts.pop()
- iso_639_1 = parts.pop()
- # At this point, we are at 'reference_name'. This field may
- # contain a quote sign, so we have to look for it and append
- # the next field, completing the field.
- reference_name = parts.pop()
- if reference_name.startswith('"'):
- reference_name = reference_name + parts.pop()
- # Now strip the quote signs
- reference_name = reference_name.strip('"')
- # If we have already seen the code, we store the name
- # for later use, else skip the rest
- if code == last_code:
- if code in alternative_names:
- # If there is already an entry, just append the current name
- alternative_names[code].append(reference_name)
- else:
- # The is no entry yet, so save the previous name, too
- alternative_names[code] = [last_name, reference_name]
- # Store code and reference_name for comparison
- last_code = code
- last_name = reference_name
+ print_name = parts.pop()
+ inverted_name = parts.pop().strip()
+ if inverted_name != print_name:
+ inverted_names[print_name] = inverted_name
names.close()
-# Set up a dictionary for the one letter abbreviations
-status_codes = {'A': 'Active', 'R': 'Retired'}
-
def create_iso_639_3_entry(entry):
result = '\t<iso_639_3_entry\n'
result += '\t\tid="%s"\n' % entry['code']
- if entry['iso_639_1'] != '':
- result += '\t\tpart1_code="%s"\n' % entry['iso_639_1']
- if entry['iso_639_2'] != '':
- result += '\t\tpart2_code="%s"\n' % entry['iso_639_2']
- result += '\t\tstatus="%s"\n' % entry['status']
+ if entry['part1'] != '':
+ result += '\t\tpart1_code="%s"\n' % entry['part1']
+ if entry['part2t'] != '':
+ result += '\t\tpart2_code="%s"\n' % entry['part2t']
result += '\t\tscope="%s"\n' % entry['element_scope']
result += '\t\ttype="%s"\n' % entry['language_type']
if 'inverted_name' in entry:
@@ -101,7 +72,7 @@ This file gives a list of all languages in the ISO 639-3
standard, and is used to provide translations via gettext
Copyright © 2005 Alastair McKinstry <mckinstry at computer.org>
-Copyright © 2008,2012 Tobias Quathamer <toddy at debian.org>
+Copyright © 2008,2012,2013 Tobias Quathamer <toddy at debian.org>
This file is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -127,7 +98,6 @@ Source: <http://www.sil.org/iso639-3/>
id CDATA #REQUIRED
part1_code CDATA #IMPLIED
part2_code CDATA #IMPLIED
- status CDATA #REQUIRED
scope CDATA #REQUIRED
type CDATA #REQUIRED
inverted_name CDATA #IMPLIED
@@ -142,9 +112,6 @@ Source: <http://www.sil.org/iso639-3/>
# The first line only contains a header, so discard it
tabular_file.readline()
-# Set up a dictionary for XML element 'iso_639_3_entry'
-iso_639_3_entry = {}
-
for li in tabular_file.readlines():
# Split the line into parts
parts = li.split('\t')
@@ -153,51 +120,28 @@ for li in tabular_file.readlines():
parts.reverse()
# Take away the parts which are always at the same position
code = parts.pop()
- status = parts.pop()
- partner_agency = parts.pop()
- iso_639_3 = parts.pop()
- iso_639_2 = parts.pop()
- b_code = parts.pop()
- bt_equiv = parts.pop()
- iso_639_1 = parts.pop()
- # At this point, we are at 'reference_name'. This field may
- # contain a quote sign, so we have to look for it and append
- # the next field, completing the field.
- reference_name = parts.pop()
- if reference_name.startswith('"'):
- reference_name = reference_name + parts.pop()
- # Now strip the quote signs
- reference_name = reference_name.strip('"')
+ part2b = parts.pop()
+ part2t = parts.pop()
+ part1 = parts.pop()
element_scope = parts.pop()
language_type = parts.pop()
- documentation = parts.pop()
- # Write the last entry, before starting a new one
- if iso_639_3_entry.has_key('code'):
- entry = create_iso_639_3_entry(iso_639_3_entry)
- xml_file.write(entry)
- iso_639_3_entry = {}
+ reference_name = parts.pop()
+ comment = parts.pop()
# Assemble the iso_639_3_entry
+ iso_639_3_entry = {}
iso_639_3_entry['code'] = code
- iso_639_3_entry['status'] = status_codes[status]
- iso_639_3_entry['iso_639_1'] = iso_639_1
- iso_639_3_entry['iso_639_2'] = iso_639_2
+ iso_639_3_entry['part2b'] = part2b
+ iso_639_3_entry['part2t'] = part2t
+ iso_639_3_entry['part1'] = part1
iso_639_3_entry['element_scope'] = element_scope
iso_639_3_entry['language_type'] = language_type
iso_639_3_entry['reference_name'] = reference_name
- # If there are alternative names, try to find the one
- # with a comma, which is treated as the inverted name.
- if code in alternative_names:
- for other_name in alternative_names[code]:
- # Skip the reference_name
- if other_name == reference_name:
- continue
- # If there is a comma, use this as the inverted form
- if ',' in other_name:
- iso_639_3_entry['inverted_name'] = other_name
-
-# Finally, write the last entry and close the XML file
-entry = create_iso_639_3_entry(iso_639_3_entry)
-xml_file.write(entry)
+ if reference_name in inverted_names:
+ iso_639_3_entry['inverted_name'] = inverted_names[reference_name]
+ entry = create_iso_639_3_entry(iso_639_3_entry)
+ xml_file.write(entry)
+
+# Finally, close the XML file
xml_file.write('</iso_639_3_entries>\n')
xml_file.close()
--
ISO language, territory, currency, script codes and their translations
More information about the Pkg-isocodes-devel mailing list