[Pkg-isocodes-devel] [SCM] ISO language, territory, currency, script codes and their translations branch, master, updated. ba6f0b662aca3fd2ea0d902813b0a39708d5521c

Tobias Quathamer toddy at debian.org
Thu Jun 13 13:19:04 UTC 2013


The following commit has been merged in the master branch:
commit 4b206c6ccc6d68d097f0aa09e79ba8d1ba6eb394
Author: Tobias Quathamer <toddy at debian.org>
Date:   Thu Jun 13 15:12:34 2013 +0200

    Rewrite parsing script for new SIL tabular file

diff --git a/iso_639_3/iso-dis-639-tab-parse.py b/iso_639_3/iso-dis-639-tab-parse.py
index 1e6af57..b159e10 100755
--- a/iso_639_3/iso-dis-639-tab-parse.py
+++ b/iso_639_3/iso-dis-639-tab-parse.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright © 2005 Alastair McKinstry <mckinstry at computer.org>
-# Copyright © 2008,2012 Tobias Quathamer <toddy at debian.org>
+# Copyright © 2008,2012,2013 Tobias Quathamer <toddy at debian.org>
 #
 # This file is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -23,12 +23,11 @@ Parse the SIL.org iso_639_3.tab file and create
 an XML file for our own use.
 """
 
-# The Name_Index file has the same fields as the normal
-# tab file. Extract the additional entries and save them
-# for later.
-alternative_names = {}
-last_code = ''
-last_name = ''
+# The Name_Index file only has the fields
+# Id, Print_Name, and Inverted_Name.
+# There may be multiple lines with the same Id.
+# Extract only the inverted names which differ.
+inverted_names = {}
 names = open("iso_639_3_Name_Index.tab")
 for li in names.readlines():
 	# Split the line into parts
@@ -36,49 +35,21 @@ for li in names.readlines():
 	# Reverse the parts, because Python's pop() function is much
 	# faster at the end of a list instead of at the start of a list
 	parts.reverse()
-	# Get the code
+	# Get the fields
 	code = parts.pop()
-	# Take away unneeded parts
-	status = parts.pop()
-	partner_agency = parts.pop()
-	iso_639_3 = parts.pop()
-	iso_639_2 = parts.pop()
-	b_code = parts.pop()
-	bt_equiv = parts.pop()
-	iso_639_1 = parts.pop()
-	# At this point, we are at 'reference_name'. This field may
-	# contain a quote sign, so we have to look for it and append
-	# the next field, completing the field.
-	reference_name = parts.pop()
-	if reference_name.startswith('"'):
-		reference_name = reference_name + parts.pop()
-		# Now strip the quote signs
-		reference_name = reference_name.strip('"')
-	# If we have already seen the code, we store the name
-	# for later use, else skip the rest
-	if code == last_code:
-		if code in alternative_names:
-			# If there is already an entry, just append the current name
-			alternative_names[code].append(reference_name)
-		else:
-			# The is no entry yet, so save the previous name, too
-			alternative_names[code] = [last_name, reference_name]
-	# Store code and reference_name for comparison
-	last_code = code
-	last_name = reference_name
+	print_name = parts.pop()
+	inverted_name = parts.pop().strip()
+	if inverted_name != print_name:
+		inverted_names[print_name] = inverted_name
 names.close()
 
-# Set up a dictionary for the one letter abbreviations
-status_codes = {'A': 'Active', 'R': 'Retired'}
-
 def create_iso_639_3_entry(entry):
 	result = '\t<iso_639_3_entry\n'
 	result += '\t\tid="%s"\n' % entry['code']
-	if entry['iso_639_1'] != '':
-		result += '\t\tpart1_code="%s"\n' % entry['iso_639_1']
-	if entry['iso_639_2'] != '':
-		result += '\t\tpart2_code="%s"\n' % entry['iso_639_2']
-	result += '\t\tstatus="%s"\n' % entry['status']
+	if entry['part1'] != '':
+		result += '\t\tpart1_code="%s"\n' % entry['part1']
+	if entry['part2t'] != '':
+		result += '\t\tpart2_code="%s"\n' % entry['part2t']
 	result += '\t\tscope="%s"\n' % entry['element_scope']
 	result += '\t\ttype="%s"\n' % entry['language_type']
 	if 'inverted_name' in entry:
@@ -101,7 +72,7 @@ This file gives a list of all languages in the ISO 639-3
 standard, and is used to provide translations via gettext
 
 Copyright © 2005 Alastair McKinstry <mckinstry at computer.org>
-Copyright © 2008,2012 Tobias Quathamer <toddy at debian.org>
+Copyright © 2008,2012,2013 Tobias Quathamer <toddy at debian.org>
 
     This file is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
@@ -127,7 +98,6 @@ Source: <http://www.sil.org/iso639-3/>
 		id		CDATA	#REQUIRED
 		part1_code	CDATA	#IMPLIED
 		part2_code	CDATA	#IMPLIED
-		status		CDATA	#REQUIRED
 		scope		CDATA   #REQUIRED
 		type		CDATA	#REQUIRED
 		inverted_name	CDATA	#IMPLIED
@@ -142,9 +112,6 @@ Source: <http://www.sil.org/iso639-3/>
 # The first line only contains a header, so discard it
 tabular_file.readline()
 
-# Set up a dictionary for XML element 'iso_639_3_entry'
-iso_639_3_entry = {}
-
 for li in tabular_file.readlines():
 	# Split the line into parts
 	parts = li.split('\t')
@@ -153,51 +120,28 @@ for li in tabular_file.readlines():
 	parts.reverse()
 	# Take away the parts which are always at the same position
 	code = parts.pop()
-	status = parts.pop()
-	partner_agency = parts.pop()
-	iso_639_3 = parts.pop()
-	iso_639_2 = parts.pop()
-	b_code = parts.pop()
-	bt_equiv = parts.pop()
-	iso_639_1 = parts.pop()
-	# At this point, we are at 'reference_name'. This field may
-	# contain a quote sign, so we have to look for it and append
-	# the next field, completing the field.
-	reference_name = parts.pop()
-	if reference_name.startswith('"'):
-		reference_name = reference_name + parts.pop()
-		# Now strip the quote signs
-		reference_name = reference_name.strip('"')
+	part2b = parts.pop()
+	part2t = parts.pop()
+	part1 = parts.pop()
 	element_scope = parts.pop()
 	language_type = parts.pop()
-	documentation = parts.pop()
-	# Write the last entry, before starting a new one
-	if iso_639_3_entry.has_key('code'):
-		entry = create_iso_639_3_entry(iso_639_3_entry)
-		xml_file.write(entry)
-		iso_639_3_entry = {}
+	reference_name = parts.pop()
+	comment = parts.pop()
 	# Assemble the iso_639_3_entry
+	iso_639_3_entry = {}
 	iso_639_3_entry['code'] = code
-	iso_639_3_entry['status'] = status_codes[status]
-	iso_639_3_entry['iso_639_1'] = iso_639_1
-	iso_639_3_entry['iso_639_2'] = iso_639_2
+	iso_639_3_entry['part2b'] = part2b
+	iso_639_3_entry['part2t'] = part2t
+	iso_639_3_entry['part1'] = part1
 	iso_639_3_entry['element_scope'] = element_scope
 	iso_639_3_entry['language_type'] = language_type
 	iso_639_3_entry['reference_name'] = reference_name
-	# If there are alternative names, try to find the one
-	# with a comma, which is treated as the inverted name.
-	if code in alternative_names:
-		for other_name in alternative_names[code]:
-			# Skip the reference_name
-			if other_name == reference_name:
-				continue
-			# If there is a comma, use this as the inverted form
-			if ',' in other_name:
-				iso_639_3_entry['inverted_name'] = other_name
-
-# Finally, write the last entry and close the XML file
-entry = create_iso_639_3_entry(iso_639_3_entry)
-xml_file.write(entry)
+	if reference_name in inverted_names:
+		iso_639_3_entry['inverted_name'] = inverted_names[reference_name]
+	entry = create_iso_639_3_entry(iso_639_3_entry)
+	xml_file.write(entry)
+
+# Finally, close the XML file
 xml_file.write('</iso_639_3_entries>\n')
 xml_file.close()
 

-- 
ISO language, territory, currency, script codes and their translations



More information about the Pkg-isocodes-devel mailing list