[libfile-bom-perl] 01/03: Imported Upstream version 0.15
gregor herrmann
gregoa at debian.org
Tue Dec 15 17:32:42 UTC 2015
This is an automated email from the git hooks/post-receive script.
gregoa pushed a commit to branch master
in repository libfile-bom-perl.
commit c5d8e2d90b9217a9a1b9d161fc6cbe24528e6a7c
Author: gregor herrmann <gregoa at debian.org>
Date: Tue Dec 15 18:31:50 2015 +0100
Imported Upstream version 0.15
---
Changes | 3 +
MANIFEST | 1 +
META.json | 50 ++++
META.yml | 53 ++--
Makefile.PL | 66 ++---
README | 746 +++++++++++++++++++++++++++++---------------------------
lib/File/BOM.pm | 4 +-
7 files changed, 502 insertions(+), 421 deletions(-)
diff --git a/Changes b/Changes
index 3151854..5356f46 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,8 @@
File::BOM changes document
+0.15 - Mon Dec 14 2015
+ - Fix spelling errors spotted by debian team. Thanks to Gregor Herrmann.
+
0.14 - Wed Oct 4 2006
- More workarounds for platforms with odd read() behaviour.
diff --git a/MANIFEST b/MANIFEST
index 3fa107e..dbdfb84 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -15,3 +15,4 @@ t/99..shutdown.t
t/data/broken_bom.txt
t/lib/Test/Framework.pm
TODO
+META.json
diff --git a/META.json b/META.json
new file mode 100644
index 0000000..d319f74
--- /dev/null
+++ b/META.json
@@ -0,0 +1,50 @@
+{
+ "abstract" : "Utilities for handling Byte Order Marks",
+ "author" : [
+ "Matt Lawrence E<lt>mattlaw at cpan.orgE<gt>"
+ ],
+ "dynamic_config" : 1,
+ "generated_by" : "Module::Build version 0.4212",
+ "license" : [
+ "perl_5"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : "2"
+ },
+ "name" : "File-BOM",
+ "prereqs" : {
+ "build" : {
+ "requires" : {
+ "Module::Build" : "0.20",
+ "Test::Exception" : "0.20",
+ "Test::More" : "0.10"
+ }
+ },
+ "configure" : {
+ "requires" : {
+ "Module::Build" : "0.42"
+ }
+ },
+ "runtime" : {
+ "requires" : {
+ "Encode" : "1.99",
+ "Readonly" : "0.06",
+ "perl" : "v5.8.3"
+ }
+ }
+ },
+ "provides" : {
+ "File::BOM" : {
+ "file" : "lib/File/BOM.pm",
+ "version" : "0.15"
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "license" : [
+ "http://dev.perl.org/licenses/"
+ ]
+ },
+ "version" : "0.15"
+}
diff --git a/META.yml b/META.yml
index 7ff3e7d..1e4304b 100644
--- a/META.yml
+++ b/META.yml
@@ -1,25 +1,28 @@
----
-name: File-BOM
-version: 0.14
-author:
- - 'Matt Lawrence E<lt>mattlaw at cpan.orgE<gt>'
-abstract: Utilities for handling Byte Order Marks
-license: perl
-resources:
- license: http://dev.perl.org/licenses/
-requires:
- Encode: 1.99
- Readonly: 0.06
- perl: 5.8.3
-build_requires:
- Module::Build: 0.20
- Test::Exception: 0.20
- Test::More: 0.10
-provides:
- File::BOM:
- file: lib/File/BOM.pm
- version: 0.14
-generated_by: Module::Build version 0.28
-meta-spec:
- url: http://module-build.sourceforge.net/META-spec-v1.2.html
- version: 1.2
+---
+abstract: 'Utilities for handling Byte Order Marks'
+author:
+ - 'Matt Lawrence E<lt>mattlaw at cpan.orgE<gt>'
+build_requires:
+ Module::Build: '0.20'
+ Test::Exception: '0.20'
+ Test::More: '0.10'
+configure_requires:
+ Module::Build: '0.42'
+dynamic_config: 1
+generated_by: 'Module::Build version 0.4212, CPAN::Meta::Converter version 2.150001'
+license: perl
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: '1.4'
+name: File-BOM
+provides:
+ File::BOM:
+ file: lib/File/BOM.pm
+ version: '0.15'
+requires:
+ Encode: '1.99'
+ Readonly: '0.06'
+ perl: v5.8.3
+resources:
+ license: http://dev.perl.org/licenses/
+version: '0.15'
diff --git a/Makefile.PL b/Makefile.PL
index 192903a..d7086b7 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -1,31 +1,35 @@
-# Note: this file was auto-generated by Module::Build::Compat version 0.03
-
- unless (eval "use Module::Build::Compat 0.02; 1" ) {
- print "This module requires Module::Build to install itself.\n";
-
- require ExtUtils::MakeMaker;
- my $yn = ExtUtils::MakeMaker::prompt
- (' Install Module::Build now from CPAN?', 'y');
-
- unless ($yn =~ /^y/i) {
- die " *** Cannot install without Module::Build. Exiting ...\n";
- }
-
- require Cwd;
- require File::Spec;
- require CPAN;
-
- # Save this 'cause CPAN will chdir all over the place.
- my $cwd = Cwd::cwd();
-
- CPAN::Shell->install('Module::Build::Compat');
- CPAN::Shell->expand("Module", "Module::Build::Compat")->uptodate
- or die "Couldn't install Module::Build, giving up.\n";
-
- chdir $cwd or die "Cannot chdir() back to $cwd: $!";
- }
- eval "use Module::Build::Compat 0.02; 1" or die $@;
-
- Module::Build::Compat->run_build_pl(args => \@ARGV);
- require Module::Build;
- Module::Build::Compat->write_makefile(build_class => 'Module::Build');
+# Note: this file was auto-generated by Module::Build::Compat version 0.4212
+require 5.008003;
+
+ unless (eval "use Module::Build::Compat 0.02; 1" ) {
+ print "This module requires Module::Build to install itself.\n";
+
+ require ExtUtils::MakeMaker;
+ my $yn = ExtUtils::MakeMaker::prompt
+ (' Install Module::Build now from CPAN?', 'y');
+
+ unless ($yn =~ /^y/i) {
+ die " *** Cannot install without Module::Build. Exiting ...\n";
+ }
+
+ require Cwd;
+ require File::Spec;
+ require CPAN;
+
+ # Save this 'cause CPAN will chdir all over the place.
+ my $cwd = Cwd::cwd();
+
+ CPAN::Shell->install('Module::Build::Compat');
+ CPAN::Shell->expand("Module", "Module::Build::Compat")->uptodate
+ or die "Couldn't install Module::Build, giving up.\n";
+
+ chdir $cwd or die "Cannot chdir() back to $cwd: $!";
+ }
+ eval "use Module::Build::Compat 0.02; 1" or die $@;
+
+ Module::Build::Compat->run_build_pl(args => \@ARGV);
+ my $build_script = 'Build';
+ $build_script .= '.com' if $^O eq 'VMS';
+ exit(0) unless(-e $build_script); # cpantesters convention
+ require Module::Build;
+ Module::Build::Compat->write_makefile(build_class => 'Module::Build');
diff --git a/README b/README
index bc66c7a..50f4d8a 100644
--- a/README
+++ b/README
@@ -1,363 +1,383 @@
-NAME
- File::BOM - Utilities for handling Byte Order Marks
-
-SYNOPSIS
- use File::BOM qw( :all )
-
- high-level functions
- # read a file with encoding from the BOM:
- open_bom(FH, $file)
- open_bom(FH, $file, ':utf8') # the same but with a default encoding
-
- # get encoding too
- $encoding = open_bom(FH, $file, ':utf8');
-
- # open a potentially unseekable file:
- ($encoding, $spillage) = open_bom(FH, $file, ':utf8');
-
- # change encoding of an open handle according to BOM
- $encoding = defuse(*HANDLE);
- ($encoding, $spillage) = defuse(*HANDLE);
-
- # Decode a string according to leading BOM:
- $unicode = decode_from_bom($string_with_bom);
-
- # Decode a string and get the encoding:
- ($unicode, $encoding) = decode_from_bom($string_with_bom)
-
- PerlIO::via interface
- # Read the Right Thing from a unicode file with BOM:
- open(HANDLE, '<:via(File::BOM)', $filename)
-
- # Writing little-endian UTF-16 file with BOM:
- open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
-
- lower-level functions
- # read BOM encoding from a filehandle:
- $encoding = get_encoding_from_filehandle(FH)
-
- # Get encoding even if FH is unseekable:
- ($encoding, $spillage) = get_encoding_from_filehandle(FH);
-
- # Get encoding from a known unseekable handle:
- ($encdoing, $spillage) = get_encoding_from_stream(FH);
-
- # get encoding and BOM length from BOM at start of string:
- ($encoding, $offset) = get_encoding_from_bom($string);
-
- variables
- # print a BOM for a known encoding
- print FH $enc2bom{$encoding};
-
- # get an encoding from a known BOM
- $enc = $bom2enc{$bom}
-
-DESCRIPTION
- This module provides functions for handling unicode byte order marks,
- which are to be found at the beginning of some files and streams.
-
- For details about what a byte order mark is, see
- <http://www.unicode.org/unicode/faq/utf_bom.html#BOM>
-
- The intention of File::BOM is for files with BOMs to be readable as
- seamlessly as possible, regardless of the encoding used. To that end,
- several different interfaces are available, as shown in the synopsis
- above.
-
-EXPORTS
- Nothing by default.
-
- symbols
- * open_bom()
- * defuse()
- * decode_from_bom()
- * get_encoding_from_filehandle()
- * get_encoding_from_stream()
- * get_encoding_from_bom()
- * %bom2enc
- * %enc2bom
-
- tags
- * :all
- All of the above
-
- * :subs
- subroutines only
-
- * :vars
- just %bom2enc and %enc2bom
-
-VARIABLES
- %bom2enc
- Maps Byte Order marks to their encodings.
-
- The keys of this hash are strings which represent the BOMs, the values
- are their encodings, in a format which is understood by Encode
-
- The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE,
- UTF-32BE and UTF-32LE
-
- %enc2bom
- A reverse-lookup hash for bom2enc, with a few aliases used in Encode,
- namely utf8, iso-10646-1 and UCS-2.
-
- Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly
- because Encode::encode automatically puts BOMs on output. See
- Encode::Unicode
-
-FUNCTIONS
- open_bom
- $encoding = open_bom(HANDLE, $filename, $default_mode)
-
- ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode)
-
- opens HANDLE for reading on $filename, setting the mode to the
- appropriate encoding for the BOM stored in the file.
-
- On failure, a fatal error is raised, see the DIAGNOSTICS section for
- details on how to catch these. This is in order to allow the return
- value(s) to be used for other purposes.
-
- If the file doesn't contain a BOM, $default_mode is used instead. Hence:
-
- open_bom(FH, 'my_file.txt', ':utf8')
-
- Opens my_file.txt for reading in an appropriate encoding found from the
- BOM in that file, or as a UTF-8 file if none is found.
-
- In the absense of a $default_mode argument, the following 2 calls should
- be equivalent:
-
- open_bom(FH, 'no_bom.txt');
-
- open(FH, '<', 'no_bom.txt');
-
- If an undefined value is passed as the handle, a symbol will be
- generated for it like open() does:
-
- # create filehandle on the fly
- $enc = open_bom(my $fh, $filename, ':utf8');
- $line = <$fh>;
-
- The filehandle will be cued up to read after the BOM. Unseekable files
- (e.g. fifos) will cause croaking, unless called in list context to catch
- spillage from the handle. Any spillage will be automatically decoded
- from the encoding, if found.
-
- e.g.
-
- # croak if my_socket is unseekable
- open_bom(FH, 'my_socket');
-
- # keep spillage if my_socket is unseekable
- ($encoding, $spillage) = open_bom(FH, 'my_socket');
-
- # discard any spillage from open_bom
- ($encoding) = open_bom(FH, 'my_socket');
-
- defuse
- $enc = defuse(FH);
-
- ($enc, $spill) = defuse(FH);
-
- FH should be a filehandle opened for reading, it will have the relevant
- encoding layer pushed onto it be binmode if a BOM is found. Spillage
- should be Unicode, not bytes.
-
- Any uncaptured spillage will be silently lost. If the handle is
- unseekable, use list context to avoid data loss.
-
- If no BOM is found, the mode will be unaffected.
-
- decode_from_bom
- $unicode_string = decode_from_bom($string, $default, $check)
-
- ($unicode_string, $encoding) = decode_from_bom($string, $default, $check)
-
- Reads a BOM from the beginning of $string, decodes $string (minus the
- BOM) and returns it to you as a perl unicode string.
-
- if $string doesn't have a BOM, $default is used instead.
-
- $check, if supplied, is passed to Encode::decode as the third argument.
-
- If there's no BOM and no default, the original string is returned and
- encoding is ''.
-
- See Encode
-
- get_encoding_from_filehandle
- $encoding = get_encoding_from_filehandle(HANDLE)
-
- ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE)
-
- Returns the encoding found in the given filehandle.
-
- The handle should be opened in a non-unicode way (e.g. mode '<:bytes')
- so that the BOM can be read in its natural state.
-
- After calling, the handle will be set to read at a point after the BOM
- (or at the beginning of the file if no BOM was found)
-
- If called in scalar context, unseekable handles cause a croak().
-
- If called in list context, unseekable handles will be read byte-by-byte
- and any spillage will be returned. See get_encoding_from_stream()
-
- get_encoding_from_stream
- ($encoding, $spillage) = get_encoding_from_stream(*FH);
-
- Read a BOM from an unrewindable source. This means reading the stream
- one byte at a time until either a BOM is found or every possible BOM is
- ruled out. Any non-BOM bytes read from the handle will be returned in
- $spillage.
-
- If a BOM is found and the spillage contains a partial character (judging
- by the expected character width for the encoding) more bytes will be
- read from the handle to ensure that a complete character is returned.
-
- Spillage is always in bytes, not characters.
-
- This function is less efficient than get_encoding_from_filehandle, but
- should work just as well on a seekable handle as on an unseekable one.
-
- get_encoding_from_bom
- ($encoding, $offset) = get_encoding_from_bom($string)
-
- Returns the encoding and length in bytes of the BOM in $string.
-
- If there is no BOM, an empty string is returned and $offset is zero.
-
- To get the data from the string, the following should work:
-
- use Encode;
-
- my($encoding, $offset) = get_encoding_from_bom($string);
-
- if ($encoding) {
- $string = decode($encoding, substr($string, $offset))
- }
-
-PerlIO::via interface
- File::BOM can be used as a PerlIO::via interface.
-
- open(HANDLE, '<:via(File::BOM)', 'my_file.txt');
-
- open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt)
- print "foo\n"; # BOM is written to file here
-
- This method is less prone to errors on non-seekable files as spillage is
- incorporated into an internal buffer, but it doesn't give you any
- information about the encoding being used, or indeed whether or not a
- BOM was present.
-
- There are a few known problems with this interface, especially
- surrounding seek() and tell(), please see the BUGS section for more
- details about this.
-
- Reading
- The via(File::BOM) layer must be added before the handle is read from,
- otherwise any BOM will be missed. If there is no BOM, no decoding will
- be done.
-
- Because of a limitation in PerlIO::via, read() always works on bytes,
- not characters. BOM decoding will still be done but output will be bytes
- of UTF-8.
-
- open(BOM, '<:via(File::BOM)', $file)
- $bytes_read = read(BOM, $buffer, $length);
- $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET);
-
- # Now $unicode is valid unicode and $buffer contains any left-over bytes
-
- Writing
- Add the via(File::BOM) layer on top of a unicode encoding layer to print
- a BOM at the start of the output file. This needs to be done before any
- data is written. The BOM is written as part of the first print command
- on the handle, so if you don't print anything to the handle, you won't
- get a BOM.
-
- There is a "Wide character in print" warning generated when the
- via(File::BOM) layer doesn't receive utf8 on writing. This glitch was
- resolved in perl version 5.8.7, but if your perl version is older than
- that, you'll need to make sure that the via(File::BOM) layer receives
- utf8 like this:
-
- # This works OK
- open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename)
-
- # This generates warnings with older perls
- open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
-
- Seeking
- Seeking with SEEK_SET results in an offset equal to the length of any
- detected BOM being applied to the position parameter. Thus:
-
- # Seek to end of BOM (not start of file!)
- seek(FILE_BOM_HANDLE, 0, SEEK_SET)
-
- Telling
- In order to work correctly with seek(), tell() also returns a postion
- adjusted by the length of the BOM.
-
-SEE ALSO
- * Encode
- * Encode::Unicode
- * <http://www.unicode.org/unicode/faq/utf_bom.html#BOM>
-
-DIAGNOSTICS
- The following exceptions are raised via croak()
-
- * Couldn't read '<filename>': $!
- open_bom() couldn't open the given file for reading
-
- * Couldn't set binmode of handle opened on '<filename>' to '<mode>': $!
- open_bom() couldn't set the binmode of the handle
-
- * No string
- decode_from_bom called on an undefined value
-
- * Unseekable handle: $!
- get_encoding_from_filehandle() or open_bom() called on an unseekable
- file or handle in scalar context.
-
- * Couldn't read from handle: $!
- _get_encoding_seekable() couldn't read the handle. This function is
- called from get_encoding_from_filehandle(), defuse() and open_bom()
-
- * Couldn't reset read position: $!
- _get_encoding_seekable couldn't seek to the position after the BOM.
-
- * Couldn't read byte: $!
- get_encoding_from_stream couldn't read from the handle. This
- function is called from get_encoding_from_filehandle() and
- open_bom() when the handle or file is unseekable.
-
-BUGS
- Older versions of PerlIO::via have a few problems with writing, see
- above.
-
- The current version of PerlIO::via has limitations with regard to seek
- and tell, currently only line-wise seek and tell are supported by this
- module. If read() is used to read partial lines, tell() will still give
- the position of the end of the last line read.
-
- Under windows, tell() seems to return erroneously when reading files
- with unix line endings.
-
- Under windows, warnings may be generated when using the PerlIO::via
- interface to read UTF-16LE and UTF-32LE encoded files. This seems to be
- a bug in the relevant encoding(...) layers.
-
-AUTHOR
- Matt Lawrence <mattlaw at cpan.org>
-
- With thanks to Mark Fowler and Steve Purkis for additional tests and
- advice.
-
-COPYRIGHT
- Copyright 2005 Matt Lawrence, All Rights Reserved.
-
- This program is free software; you can redistribute it and/or modify it
- under the same terms as Perl itself.
-
+NAME
+ File::BOM - Utilities for handling Byte Order Marks
+
+SYNOPSIS
+ use File::BOM qw( :all )
+
+ high-level functions
+ # read a file with encoding from the BOM:
+ open_bom(FH, $file)
+ open_bom(FH, $file, ':utf8') # the same but with a default encoding
+
+ # get encoding too
+ $encoding = open_bom(FH, $file, ':utf8');
+
+ # open a potentially unseekable file:
+ ($encoding, $spillage) = open_bom(FH, $file, ':utf8');
+
+ # change encoding of an open handle according to BOM
+ $encoding = defuse(*HANDLE);
+ ($encoding, $spillage) = defuse(*HANDLE);
+
+ # Decode a string according to leading BOM:
+ $unicode = decode_from_bom($string_with_bom);
+
+ # Decode a string and get the encoding:
+ ($unicode, $encoding) = decode_from_bom($string_with_bom)
+
+ PerlIO::via interface
+ # Read the Right Thing from a unicode file with BOM:
+ open(HANDLE, '<:via(File::BOM)', $filename)
+
+ # Writing little-endian UTF-16 file with BOM:
+ open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
+
+ lower-level functions
+ # read BOM encoding from a filehandle:
+ $encoding = get_encoding_from_filehandle(FH)
+
+ # Get encoding even if FH is unseekable:
+ ($encoding, $spillage) = get_encoding_from_filehandle(FH);
+
+ # Get encoding from a known unseekable handle:
+ ($encoding, $spillage) = get_encoding_from_stream(FH);
+
+ # get encoding and BOM length from BOM at start of string:
+ ($encoding, $offset) = get_encoding_from_bom($string);
+
+ variables
+ # print a BOM for a known encoding
+ print FH $enc2bom{$encoding};
+
+ # get an encoding from a known BOM
+ $enc = $bom2enc{$bom}
+
+DESCRIPTION
+ This module provides functions for handling unicode byte order marks,
+ which are to be found at the beginning of some files and streams.
+
+ For details about what a byte order mark is, see
+ <http://www.unicode.org/unicode/faq/utf_bom.html#BOM>
+
+ The intention of File::BOM is for files with BOMs to be readable as
+ seamlessly as possible, regardless of the encoding used. To that end,
+ several different interfaces are available, as shown in the synopsis
+ above.
+
+EXPORTS
+ Nothing by default.
+
+ symbols
+ * open_bom()
+
+ * defuse()
+
+ * decode_from_bom()
+
+ * get_encoding_from_filehandle()
+
+ * get_encoding_from_stream()
+
+ * get_encoding_from_bom()
+
+ * %bom2enc
+
+ * %enc2bom
+
+ tags
+ * :all
+
+ All of the above
+
+ * :subs
+
+ subroutines only
+
+ * :vars
+
+ just %bom2enc and %enc2bom
+
+VARIABLES
+ %bom2enc
+ Maps Byte Order marks to their encodings.
+
+ The keys of this hash are strings which represent the BOMs, the values
+ are their encodings, in a format which is understood by Encode
+
+ The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE,
+ UTF-32BE and UTF-32LE
+
+ %enc2bom
+ A reverse-lookup hash for bom2enc, with a few aliases used in Encode,
+ namely utf8, iso-10646-1 and UCS-2.
+
+ Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly
+ because Encode::encode automatically puts BOMs on output. See
+ Encode::Unicode
+
+FUNCTIONS
+ open_bom
+ $encoding = open_bom(HANDLE, $filename, $default_mode)
+
+ ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode)
+
+ opens HANDLE for reading on $filename, setting the mode to the
+ appropriate encoding for the BOM stored in the file.
+
+ On failure, a fatal error is raised, see the DIAGNOSTICS section for
+ details on how to catch these. This is in order to allow the return
+ value(s) to be used for other purposes.
+
+ If the file doesn't contain a BOM, $default_mode is used instead. Hence:
+
+ open_bom(FH, 'my_file.txt', ':utf8')
+
+ Opens my_file.txt for reading in an appropriate encoding found from the
+ BOM in that file, or as a UTF-8 file if none is found.
+
+ In the absence of a $default_mode argument, the following 2 calls should
+ be equivalent:
+
+ open_bom(FH, 'no_bom.txt');
+
+ open(FH, '<', 'no_bom.txt');
+
+ If an undefined value is passed as the handle, a symbol will be
+ generated for it like open() does:
+
+ # create filehandle on the fly
+ $enc = open_bom(my $fh, $filename, ':utf8');
+ $line = <$fh>;
+
+ The filehandle will be cued up to read after the BOM. Unseekable files
+ (e.g. fifos) will cause croaking, unless called in list context to catch
+ spillage from the handle. Any spillage will be automatically decoded
+ from the encoding, if found.
+
+ e.g.
+
+ # croak if my_socket is unseekable
+ open_bom(FH, 'my_socket');
+
+ # keep spillage if my_socket is unseekable
+ ($encoding, $spillage) = open_bom(FH, 'my_socket');
+
+ # discard any spillage from open_bom
+ ($encoding) = open_bom(FH, 'my_socket');
+
+ defuse
+ $enc = defuse(FH);
+
+ ($enc, $spill) = defuse(FH);
+
+ FH should be a filehandle opened for reading, it will have the relevant
+ encoding layer pushed onto it by binmode if a BOM is found. Spillage
+ should be Unicode, not bytes.
+
+ Any uncaptured spillage will be silently lost. If the handle is
+ unseekable, use list context to avoid data loss.
+
+ If no BOM is found, the mode will be unaffected.
+
+ decode_from_bom
+ $unicode_string = decode_from_bom($string, $default, $check)
+
+ ($unicode_string, $encoding) = decode_from_bom($string, $default, $check)
+
+ Reads a BOM from the beginning of $string, decodes $string (minus the
+ BOM) and returns it to you as a perl unicode string.
+
+ if $string doesn't have a BOM, $default is used instead.
+
+ $check, if supplied, is passed to Encode::decode as the third argument.
+
+ If there's no BOM and no default, the original string is returned and
+ encoding is ''.
+
+ See Encode
+
+ get_encoding_from_filehandle
+ $encoding = get_encoding_from_filehandle(HANDLE)
+
+ ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE)
+
+ Returns the encoding found in the given filehandle.
+
+ The handle should be opened in a non-unicode way (e.g. mode '<:bytes')
+ so that the BOM can be read in its natural state.
+
+ After calling, the handle will be set to read at a point after the BOM
+ (or at the beginning of the file if no BOM was found)
+
+ If called in scalar context, unseekable handles cause a croak().
+
+ If called in list context, unseekable handles will be read byte-by-byte
+ and any spillage will be returned. See get_encoding_from_stream()
+
+ get_encoding_from_stream
+ ($encoding, $spillage) = get_encoding_from_stream(*FH);
+
+ Read a BOM from an unrewindable source. This means reading the stream
+ one byte at a time until either a BOM is found or every possible BOM is
+ ruled out. Any non-BOM bytes read from the handle will be returned in
+ $spillage.
+
+ If a BOM is found and the spillage contains a partial character (judging
+ by the expected character width for the encoding) more bytes will be
+ read from the handle to ensure that a complete character is returned.
+
+ Spillage is always in bytes, not characters.
+
+ This function is less efficient than get_encoding_from_filehandle, but
+ should work just as well on a seekable handle as on an unseekable one.
+
+ get_encoding_from_bom
+ ($encoding, $offset) = get_encoding_from_bom($string)
+
+ Returns the encoding and length in bytes of the BOM in $string.
+
+ If there is no BOM, an empty string is returned and $offset is zero.
+
+ To get the data from the string, the following should work:
+
+ use Encode;
+
+ my($encoding, $offset) = get_encoding_from_bom($string);
+
+ if ($encoding) {
+ $string = decode($encoding, substr($string, $offset))
+ }
+
+PerlIO::via interface
+ File::BOM can be used as a PerlIO::via interface.
+
+ open(HANDLE, '<:via(File::BOM)', 'my_file.txt');
+
+ open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt');
+ print HANDLE "foo\n"; # BOM is written to file here
+
+ This method is less prone to errors on non-seekable files as spillage is
+ incorporated into an internal buffer, but it doesn't give you any
+ information about the encoding being used, or indeed whether or not a
+ BOM was present.
+
+ There are a few known problems with this interface, especially
+ surrounding seek() and tell(), please see the BUGS section for more
+ details about this.
+
+ Reading
+ The via(File::BOM) layer must be added before the handle is read from,
+ otherwise any BOM will be missed. If there is no BOM, no decoding will
+ be done.
+
+ Because of a limitation in PerlIO::via, read() always works on bytes,
+ not characters. BOM decoding will still be done but output will be bytes
+ of UTF-8.
+
+ open(BOM, '<:via(File::BOM)', $file)
+ $bytes_read = read(BOM, $buffer, $length);
+ $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET);
+
+ # Now $unicode is valid unicode and $buffer contains any left-over bytes
+
+ Writing
+ Add the via(File::BOM) layer on top of a unicode encoding layer to print
+ a BOM at the start of the output file. This needs to be done before any
+ data is written. The BOM is written as part of the first print command
+ on the handle, so if you don't print anything to the handle, you won't
+ get a BOM.
+
+ There is a "Wide character in print" warning generated when the
+ via(File::BOM) layer doesn't receive utf8 on writing. This glitch was
+ resolved in perl version 5.8.7, but if your perl version is older than
+ that, you'll need to make sure that the via(File::BOM) layer receives
+ utf8 like this:
+
+ # This works OK
+ open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename)
+
+ # This generates warnings with older perls
+ open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
+
+ Seeking
+ Seeking with SEEK_SET results in an offset equal to the length of any
+ detected BOM being applied to the position parameter. Thus:
+
+ # Seek to end of BOM (not start of file!)
+ seek(FILE_BOM_HANDLE, 0, SEEK_SET)
+
+ Telling
+ In order to work correctly with seek(), tell() also returns a position
+ adjusted by the length of the BOM.
+
+SEE ALSO
+ * Encode
+
+ * Encode::Unicode
+
+ * <http://www.unicode.org/unicode/faq/utf_bom.html#BOM>
+
+DIAGNOSTICS
+ The following exceptions are raised via croak()
+
+ * Couldn't read '<filename>': $!
+
+ open_bom() couldn't open the given file for reading
+
+ * Couldn't set binmode of handle opened on '<filename>' to '<mode>':
+ $!
+
+ open_bom() couldn't set the binmode of the handle
+
+ * No string
+
+ decode_from_bom called on an undefined value
+
+ * Unseekable handle: $!
+
+ get_encoding_from_filehandle() or open_bom() called on an unseekable
+ file or handle in scalar context.
+
+ * Couldn't read from handle: $!
+
+ _get_encoding_seekable() couldn't read the handle. This function is
+ called from get_encoding_from_filehandle(), defuse() and open_bom()
+
+ * Couldn't reset read position: $!
+
+ _get_encoding_seekable couldn't seek to the position after the BOM.
+
+ * Couldn't read byte: $!
+
+ get_encoding_from_stream couldn't read from the handle. This
+ function is called from get_encoding_from_filehandle() and
+ open_bom() when the handle or file is unseekable.
+
+BUGS
+ Older versions of PerlIO::via have a few problems with writing, see
+ above.
+
+ The current version of PerlIO::via has limitations with regard to seek
+ and tell, currently only line-wise seek and tell are supported by this
+ module. If read() is used to read partial lines, tell() will still give
+ the position of the end of the last line read.
+
+ Under windows, tell() seems to return erroneously when reading files
+ with unix line endings.
+
+ Under windows, warnings may be generated when using the PerlIO::via
+ interface to read UTF-16LE and UTF-32LE encoded files. This seems to be
+ a bug in the relevant encoding(...) layers.
+
+AUTHOR
+ Matt Lawrence <mattlaw at cpan.org>
+
+ With thanks to Mark Fowler and Steve Purkis for additional tests and
+ advice.
+
+COPYRIGHT
+ Copyright 2005 Matt Lawrence, All Rights Reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the same terms as Perl itself.
+
diff --git a/lib/File/BOM.pm b/lib/File/BOM.pm
index 18d2c9b..05b071e 100644
--- a/lib/File/BOM.pm
+++ b/lib/File/BOM.pm
@@ -101,7 +101,7 @@ my @subs = qw(
my @vars = qw( %bom2enc %enc2bom );
-our $VERSION = '0.14';
+our $VERSION = '0.15';
our @EXPORT = ();
our @EXPORT_OK = ( @subs, @vars );
@@ -234,7 +234,7 @@ If the file doesn't contain a BOM, $default_mode is used instead. Hence:
Opens my_file.txt for reading in an appropriate encoding found from the BOM in
that file, or as a UTF-8 file if none is found.
-In the absense of a $default_mode argument, the following 2 calls should be equivalent:
+In the absence of a $default_mode argument, the following 2 calls should be equivalent:
open_bom(FH, 'no_bom.txt');
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libfile-bom-perl.git
More information about the Pkg-perl-cvs-commits
mailing list