fixed rt.cpan.org#113671: recognize BOM at start of a utf8 file

fixed rt.cpan.org#112857: Same Line Closing Block Breaks Parser git-svn-id: http://dev.catalyst.perl.org/repos/Config-General/trunk@110 be1acefe-a474-0410-9a34-9b3221f2030f
2025-12-16 20:21:01 +01:00 · 2016-04-18 13:09:45 +00:00
parent 34397e9b04
commit 7a48e048cf
6 changed files with 281 additions and 244 deletions
--- a/General.pm
+++ b/General.pm
@@ -38,6 +38,8 @@ use vars  qw(@ISA @EXPORT_OK);
 use base qw(Exporter);
@EXPORT_OK = qw(ParseConfig SaveConfig SaveConfigString);

+use constant _UTF8_BOM => "\x{ef}\x{bb}\x{bf}";
+
 sub new {
  #
  # create new Config::General object
@@ -407,7 +409,7 @@ sub files {

 sub _open {
  #
-  # open the config file, or expand a directory or glob
+  # open the config file, or expand a directory or glob or include
  #
  my($this, $basefile, $basepath) = @_;
  my $cont;
@@ -433,11 +435,11 @@ sub _open {
    # applied patch by AlexK fixing rt.cpan.org#41030
    if ( !@include && defined $this->{ConfigPath} ) {
      foreach my $dir (@{$this->{ConfigPath}}) {
-	my ($volume, $path, undef) = splitpath($basefile);
-	if ( -d catfile( $dir, $path )  ) {
-	  push @include, grep { -f $_ } bsd_glob(catfile($dir, $basefile), GLOB_BRACE | GLOB_QUOTE);
-	  last;
-	}
+        my ($volume, $path, undef) = splitpath($basefile);
+        if ( -d catfile( $dir, $path )  ) {
+          push @include, grep { -f $_ } bsd_glob(catfile($dir, $basefile), GLOB_BRACE | GLOB_QUOTE);
+          last;
+        }
      }
    }

@@ -449,7 +451,7 @@ sub _open {
      # include foo/* shouldn't fail if there isn't anything matching)
      # rt.cpan.org#79869: local $this->{IncludeGlob};
      for (@include) {
-	$this->_open($_);
+        $this->_open($_);
      }
      return;
    }
@@ -460,11 +462,11 @@ sub _open {
    if (defined $this->{ConfigPath}) {
      # try to find the file within ConfigPath
      foreach my $dir (@{$this->{ConfigPath}}) {
-	if( -e catfile($dir, $basefile) ) {
-	  $configfile = catfile($dir, $basefile);
-	  $found = 1;
-	  last; # found it
-	}
+        if( -e catfile($dir, $basefile) ) {
+          $configfile = catfile($dir, $basefile);
+          $found = 1;
+          last; # found it
+        }
      }
    }
    if (!$found) {
@@ -490,16 +492,9 @@ sub _open {
      my $file = catfile($configfile, $_);
      if (! exists $this->{files}->{$file} or $this->{IncludeAgain} ) {
        # support re-read if used urged us to do so, otherwise ignore the file
-	if ($this->{UTF8}) {
-	  $fh = IO::File->new;
-	  open( $fh, "<:utf8", $file)
-	    or croak "Config::General: Could not open $file in UTF8 mode!($!)\n";
-	}
-	else {
-	  $fh = IO::File->new( $file, 'r') or croak "Config::General: Could not open $file!($!)\n";
-	}
-	$this->{files}->{"$file"} = 1;
-	$this->_read($fh);
+        $fh = $this->_openfile_for_read($file);
+        $this->{files}->{"$file"} = 1;
+        $this->_read($fh);
      }
      else {
        warn "File $file already loaded.  Use -IncludeAgain to load it again.\n";
@@ -516,16 +511,7 @@ sub _open {
      return;
    }
    else {
-      if ($this->{UTF8}) {
-	$fh = IO::File->new;
-	open( $fh, "<:utf8", $configfile)
-	  or croak "Config::General: Could not open $configfile in UTF8 mode!($!)\n";
-      }
-      else {
-	$fh = IO::File->new( "$configfile", 'r')
-	  or croak "Config::General: Could not open $configfile!($!)\n";
-      }
-
+      $fh = $this->_openfile_for_read($configfile);
      $this->{files}->{$configfile}    = 1;

      my ($volume, $path, undef)           = splitpath($configfile);
@@ -538,6 +524,30 @@ sub _open {
 }


+sub _openfile_for_read {
+  #
+  # actually open a file, turn on utf8 mode if requested by bom
+  #
+  my ($this, $file) = @_;
+
+  my $fh = IO::File->new( $file, 'r')
+    or croak "Config::General: Could not open $file!($!)\n";
+
+  # attempt to read an initial utf8 byte-order mark (BOM)
+  my $n_read  = sysread $fh, my $read_BOM, length(_UTF8_BOM);
+  my $has_BOM = $n_read == length(_UTF8_BOM) && $read_BOM eq _UTF8_BOM;
+
+  # set utf8 perlio layer if BOM was found or if option -UTF8 is turned on
+  binmode $fh, ":utf8" if $this->{UTF8} || $has_BOM;
+
+  # rewind to beginning of file if we read chars that were not the BOM
+  sysseek $fh, 0, 0 if $n_read && !$has_BOM;
+
+  return $fh;
+}
+
+
+
 sub _read {
  #
  # store the config contents in @content
@@ -713,7 +723,7 @@ sub _read {
      my $incl_file;
      my $path = '';
      if ( $this->{IncludeRelative} and defined $this->{CurrentConfigFilePath}) {
-      	$path = $this->{CurrentConfigFilePath};
+        $path = $this->{CurrentConfigFilePath};
      }
      elsif (defined $this->{ConfigPath}) {
        # fetch pathname of base config file, assuming the 1st one is the path of it
@@ -2113,82 +2123,86 @@ The method B<getall> returns a hash of all values.

 You can define a B<block> of options. A B<block> looks much like a block
 in the wellknown Apache config format. It starts with E<lt>B<blockname>E<gt> and ends
-with E<lt>/B<blockname>E<gt>. An example:
+with E<lt>/B<blockname>E<gt>.

- <database>
-    host   = muli
-    user   = moare
-    dbname = modb
-    dbpass = D4r_9Iu
- </database>
+A block start and end cannot be on the same line.
+
+An example:
+
+<database>
+host   = muli
+user   = moare
+dbname = modb
+dbpass = D4r_9Iu
+</database>

 Blocks can also be nested. Here is a more complicated example:

- user   = hans
- server = mc200
- db     = maxis
- passwd = D3rf$
- <jonas>
-        user    = tom
-        db      = unknown
-        host    = mila
-        <tablestructure>
-                index   int(100000)
-                name    char(100)
-                prename char(100)
-                city    char(100)
-                status  int(10)
-                allowed moses
-                allowed ingram
-                allowed joice
-        </tablestructure>
- </jonas>
+user   = hans
+server = mc200
+db     = maxis
+passwd = D3rf$
+<jonas>
+user    = tom
+db      = unknown
+host    = mila
+<tablestructure>
+index   int(100000)
+name    char(100)
+prename char(100)
+city    char(100)
+status  int(10)
+allowed moses
+allowed ingram
+allowed joice
+</tablestructure>
+</jonas>

 The hash which the method B<getall> returns look like that:

- print Data::Dumper(\%hash);
- $VAR1 = {
-          'passwd' => 'D3rf$',
-          'jonas'  => {
-                       'tablestructure' => {
-                                             'prename' => 'char(100)',
-                                             'index'   => 'int(100000)',
-                                             'city'    => 'char(100)',
-                                             'name'    => 'char(100)',
-                                             'status'  => 'int(10)',
-                                             'allowed' => [
-                                                            'moses',
-                                                            'ingram',
-                                                            'joice',
-                                                          ]
-                                           },
-                       'host'           => 'mila',
-                       'db'             => 'unknown',
-                       'user'           => 'tom'
-                     },
-          'db'     => 'maxis',
-          'server' => 'mc200',
-          'user'   => 'hans'
-        };
+print Data::Dumper(\%hash);
+$VAR1 = {
+'passwd' => 'D3rf$',
+'jonas'  => {
+'tablestructure' => {
+'prename' => 'char(100)',
+'index'   => 'int(100000)',
+'city'    => 'char(100)',
+'name'    => 'char(100)',
+'status'  => 'int(10)',
+'allowed' => [
+'moses',
+'ingram',
+'joice',
+]
+},
+'host'           => 'mila',
+'db'             => 'unknown',
+'user'           => 'tom'
+},
+'db'     => 'maxis',
+'server' => 'mc200',
+'user'   => 'hans'
+};

 If you have turned on B<-LowerCaseNames> (see new()) then blocks as in the
 following example:

- <Dir>
-   <AttriBUTES>
-     Owner  root
-   </attributes>
- </dir>
+<Dir>
+<AttriBUTES>
+Owner  root
+</attributes>
+</dir>

 would produce the following hash structure:

- $VAR1 = {
-          'dir' => {
-                    'attributes' => {
-                                     'owner  => "root",
-                                    }
-                   }
-         };
+$VAR1 = {
+'dir' => {
+'attributes' => {
+'owner  => "root",
+}
+}
+};

 As you can see, the keys inside the config hash are normalized.

@@ -2210,29 +2224,29 @@ create a hashref with the left part of the named block as the key containing
 one or more hashrefs with the right part of the block as key containing everything
 inside the block(which may again be nested!). As examples says more than words:

- # given the following sample
- <Directory /usr/frisco>
-        Limit Deny
-        Options ExecCgi Index
- </Directory>
- <Directory /usr/frik>
-        Limit DenyAll
-        Options None
- </Directory>
+# given the following sample
+<Directory /usr/frisco>
+Limit Deny
+Options ExecCgi Index
+</Directory>
+<Directory /usr/frik>
+Limit DenyAll
+Options None
+</Directory>

- # you will get:
- $VAR1 = {
-          'Directory' => {
-                           '/usr/frik' => {
-                                            'Options' => 'None',
-                                            'Limit' => 'DenyAll'
-                                          },
-                           '/usr/frisco' => {
-                                              'Options' => 'ExecCgi Index',
-                                              'Limit' => 'Deny'
-                                            }
-                         }
-        };
+# you will get:
+$VAR1 = {
+'Directory' => {
+'/usr/frik' => {
+'Options' => 'None',
+'Limit' => 'DenyAll'
+},
+'/usr/frisco' => {
+'Options' => 'ExecCgi Index',
+'Limit' => 'Deny'
+}
+}
+};

 You cannot have more than one named block with the same name because it will
 be stored in a hashref and therefore be overwritten if a block occurs once more.
@@ -2248,28 +2262,28 @@ Sometimes you may need blocknames which have whitespace in their names.
 With named blocks this is no problem, as the module only looks for the
 first whitespace:

- <person hugo gera>
- </person>
+<person hugo gera>
+</person>

 would be parsed to:

- $VAR1 = {
-          'person' => {
-                       'hugo gera' => {
-                                      },
-                      }
-         };
+$VAR1 = {
+'person' => {
+'hugo gera' => {
+},
+}
+};

 The problem occurs, if you want to have a simple block containing whitespace:

- <hugo gera>
- </hugo gera>
+<hugo gera>
+</hugo gera>

 This would be parsed as a named block, which is not what you wanted. In this
 very case you may use quotation marks to indicate that it is not a named block:

- <"hugo gera">
- </"hugo gera">
+<"hugo gera">
+</"hugo gera">

 The save() method of the module inserts automatically quotation marks in such
 cases.
@@ -2283,12 +2297,12 @@ explicit empty blocks.
 Normally you would write this in your config to define an empty
 block:

- <driver Apache>
- </driver>
+<driver Apache>
+</driver>

 To save writing you can also write:

- <driver Apache/>
+<driver Apache/>

 which is the very same as above. This works for normal blocks and
 for named blocks.
@@ -2300,45 +2314,45 @@ for named blocks.
 You may have more than one line of the same option with different values.

 Example:
- log  log1
- log  log2
- log  log2
+log  log1
+log  log2
+log  log2

 You will get a scalar if the option occurred only once or an array if it occurred
 more than once. If you expect multiple identical options, then you may need to
 check if an option occurred more than once:

- $allowed = $hash{jonas}->{tablestructure}->{allowed};
- if(ref($allowed) eq "ARRAY") {
-     @ALLOWED = @{$allowed};
- else {
-     @ALLOWED = ($allowed);
- }
+$allowed = $hash{jonas}->{tablestructure}->{allowed};
+if(ref($allowed) eq "ARRAY") {
+@ALLOWED = @{$allowed};
+else {
+@ALLOWED = ($allowed);
+}

 The same applies to blocks and named blocks too (they are described in more detail
 below). For example, if you have the following config:

- <dir blah>
-   user max
- </dir>
- <dir blah>
-   user hannes
- </dir>
+<dir blah>
+user max
+</dir>
+<dir blah>
+user hannes
+</dir>

 then you would end up with a data structure like this:

- $VAR1 = {
-          'dir' => {
-                    'blah' => [
-                                {
-                                  'user' => 'max'
-                                },
-                                {
-                                  'user' => 'hannes'
-                                }
-                              ]
-                    }
-          };
+$VAR1 = {
+'dir' => {
+'blah' => [
+{
+'user' => 'max'
+},
+{
+'user' => 'hannes'
+}
+]
+}
+};

 As you can see, the two identical blocks are stored in a hash which contains
 an array(-reference) of hashes.
@@ -2349,16 +2363,16 @@ both identical blocks, then you need to turn the B<new()> parameter B<-MergeDupl
 on (see above). The parsed structure of the example above would then look like
 this:

- $VAR1 = {
-          'dir' => {
-                    'blah' => {
-			       'user' => [
-					   'max',
-					   'hannes'
-					 ]
-                              }
-                    }
-          };
+$VAR1 = {
+'dir' => {
+'blah' => {
+'user' => [
+'max',
+'hannes'
+]
+}
+}
+};

 As you can see, there is only one hash "dir->{blah}" containing multiple
 "user" entries. As you can also see, turning on  B<-MergeDuplicateBlocks>
@@ -2376,7 +2390,7 @@ You may also force a single config line to get parsed into an array by
 turning on the option B<-ForceArray> and by surrounding the value of the
 config entry by []. Example:

- hostlist = [ foo.bar ]
+hostlist = [ foo.bar ]

 Will be a singlevalue array entry if the option is turned on. If you want
 it to remain to be an array you have to turn on B<-ForceArray> during save too.
@@ -2390,11 +2404,11 @@ of the line. The Config::General module will concatenate those lines to one sing
 Example:

 command = cat /var/log/secure/tripwire | \
-           mail C<-s> "report from tripwire" \
-           honey@myotherhost.nl
+mail C<-s> "report from tripwire" \
+honey@myotherhost.nl

 command will become:
- "cat /var/log/secure/tripwire | mail C<-s> 'report from twire' honey@myotherhost.nl"
+"cat /var/log/secure/tripwire | mail C<-s> 'report from twire' honey@myotherhost.nl"


 =head1 HERE DOCUMENTS
@@ -2405,12 +2419,12 @@ identifier must follow a "<<".

 Example:

- message <<EOF
-   we want to
-   remove the
-   homedir of
-   root.
- EOF
+message <<EOF
+we want to
+remove the
+homedir of
+root.
+EOF

 Everything between the two "EOF" strings will be in the option I<message>.

@@ -2421,19 +2435,19 @@ amount of spaces from every line inside the here-document.

 Example:

- message <<EOF
-         we want to
-         remove the
-         homedir of
-         root.
-      EOF
+message <<EOF
+we want to
+remove the
+homedir of
+root.
+EOF

 After parsing, message will become:

-   we want to
-   remove the
-   homedir of
-   root.
+we want to
+remove the
+homedir of
+root.

 because there were the string "     " in front of EOF, which were cut from every
 line inside the here-document.
@@ -2445,12 +2459,12 @@ line inside the here-document.
 You can include an external file at any position in your config file using the following statement
 in your config file:

- <<include externalconfig.rc>>
+<<include externalconfig.rc>>

 If you turned on B<-UseApacheInclude> (see B<new()>), then you can also use the following
 statement to include an external file:

- include externalconfig.rc
+include externalconfig.rc

 This file will be inserted at the position where it was found as if the contents of this file
 were directly at this position.
@@ -2464,22 +2478,22 @@ working directory. Under some circumstances it maybe possible to
 open included files from the directory, where the configfile resides. You need to turn on
 the option B<-IncludeRelative> (see B<new()>) if you want that. An example:

- my $conf = Config::General(
-                             -ConfigFile => "/etc/crypt.d/server.cfg"
-                             -IncludeRelative => 1
-                           );
+my $conf = Config::General(
+-ConfigFile => "/etc/crypt.d/server.cfg"
+-IncludeRelative => 1
+);

- /etc/crypt.d/server.cfg:
-  <<include acl.cfg>>
+/etc/crypt.d/server.cfg:
+<<include acl.cfg>>

 In this example Config::General will try to include I<acl.cfg> from I</etc/crypt.d>:

- /etc/crypt.d/acl.cfg
+/etc/crypt.d/acl.cfg

 The default behavior (if B<-IncludeRelative> is B<not> set!) will be to open just I<acl.cfg>,
 wherever it is, i.e. if you did a chdir("/usr/local/etc"), then Config::General will include:

- /usr/local/etc/acl.cfg
+/usr/local/etc/acl.cfg

 Include statements can be case insensitive (added in version 1.25).

@@ -2492,21 +2506,21 @@ so make sure, you're not including the same file from within itself!

 Example:

-    # main.cfg
-    <object billy>
-        class=Some::Class
-        <printers>
-            include printers.cfg
-        </printers>
-        # ...
-    </object>
-    <object bob>
-        class=Another::Class
-        <printers>
-            include printers.cfg
-        </printers>
-        # ...
-    </object>
+# main.cfg
+<object billy>
+class=Some::Class
+<printers>
+include printers.cfg
+</printers>
+# ...
+</object>
+<object bob>
+class=Another::Class
+<printers>
+include printers.cfg
+</printers>
+# ...
+</object>

 Now C<printers.cfg> will be include in both the C<billy> and C<bob> objects.

@@ -2522,18 +2536,18 @@ tab stops in front of the #.

 A comment can also occur after a config statement. Example:

- username = max  # this is the comment
+username = max  # this is the comment

 If you want to comment out a large block you can use C-style comments. A B</*> signals
 the begin of a comment block and the B<*/> signals the end of the comment block.
 Example:

- user  = max # valid option
- db    = tothemax
- /*
- user  = andors
- db    = toand
- */
+user  = max # valid option
+db    = tothemax
+/*
+user  = andors
+db    = toand
+*/

 In this example the second options of user and db will be ignored. Please beware of the fact,
 if the Module finds a B</*> string which is the start of a comment block, but no matching
@@ -2542,7 +2556,7 @@ end block, it will ignore the whole rest of the config file!
 B<NOTE:> If you require the B<#> character (number sign) to remain in the option value, then
 you can use a backslash in front of it, to escape it. Example:

- bgcolor = \#ffffcc
+bgcolor = \#ffffcc

 In this example the value of $config{bgcolor} will be "#ffffcc", Config::General will not treat
 the number sign as the begin of a comment because of the leading backslash.
@@ -2558,25 +2572,25 @@ and parsing.

 The general aproach works like this:

- sub ck {
-  my($file, $base) = @_;
-  print "_open() tries $file ... ";
-  if($file =~ /blah/) {
-    print "ignored\n";
-    return (0);
-  }
-  else {
-    print "allowed\n";
-    return (1, @_);
-  }
- }
+sub ck {
+my($file, $base) = @_;
+print "_open() tries $file ... ";
+if($file =~ /blah/) {
+print "ignored\n";
+return (0);
+}
+else {
+print "allowed\n";
+return (1, @_);
+}
+}
 
- my %c = ParseConfig(
-              -IncludeGlob => 1,
-              -UseApacheInclude => 1,
-              -ConfigFile => shift,
-              -Plug => { pre_open => *ck }
- );
+my %c = ParseConfig(
+-IncludeGlob => 1,
+-UseApacheInclude => 1,
+-ConfigFile => shift,
+-Plug => { pre_open => *ck }
+);
 
 Output:

@@ -2746,7 +2760,7 @@ I recommend you to read the following documents, which are supplied with Perl:

 =head1 LICENSE AND COPYRIGHT

-Copyright (c) 2000-2015 Thomas Linden
+Copyright (c) 2000-2016 Thomas Linden

 This library is free software; you can redistribute it and/or
 modify it under the same terms as Perl itself.