# # Copyleft (l) 2000-2016 Thomas v.D. . # # leo may be # used and distributed under the terms of the GNU General Public License. # All other brand and product names are trademarks, registered trademarks # or service marks of their respective holders. package WWW::Dict::Leo::Org; $WWW::Dict::Leo::Org::VERSION = "1.41"; use strict; use warnings; use English '-no_match_vars'; use Carp::Heavy; use Carp; use IO::Socket; use MIME::Base64; use HTML::TableParser; sub debug; sub new { my ($class, %param) = @_; my $type = ref( $class ) || $class; my %settings = ( "-Host" => "pda.leo.org", "-Port" => 80, "-UserAgent" => "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9", "-Proxy" => "", "-ProxyUser" => "", "-ProxyPass" => "", "-Debug" => 0, "-SpellTolerance" => "standard", # on, off "-Morphology" => "standard", # none, forcedAll "-CharTolerance" => "relaxed", # fuzzy, exact "-Language" => "en", # en2de, de2fr, fr2de, de2es, es2de "data" => {}, # the results "section" => [], "title" => "", "segments" => [], "Maxsize" => 0, "Linecount" => 0, ); foreach my $key (keys %param) { $settings{$key} = $param{$key}; # override defaults } my $self = \%settings; bless $self, $type; return $self; } sub translate { my($this, $term) = @_; my $linecount = 0; my $maxsize = 0; my @match = (); # # form var transitions for searchLoc(=translation direction) and lp(=language) my %lang = ( speak => "ende" ); my @langs = qw(en es ru pt fr pl ch it); if ($this->{"-Language"}) { # en | fr | ru2en | de2pl etc # de2, 2de, de are not part of lang spec if (! grep { $this->{"-Language"} =~ /$_/ } @langs) { croak "Unsupported language: " . $this->{"-Language"}; } my $spec = $this->{"-Language"}; my $l; if ($spec =~ /(..)2de/) { $l = $1; $this->{"-Language"} = -1; $lang{speak} = "${l}de"; } elsif ($spec =~ /de2(..)/) { $l = $1; $this->{"-Language"} = 1; $lang{speak} = "${l}de"; } else { $lang{speak} = $this->{"-Language"} . 'de'; $this->{"-Language"} = 0; } } # # cut invalid values for parameters or set defaults if unspecified # my %form = ( spellToler => { mask => [ qw(standard on off) ], val => $this->{"-SpellTolerance"} || "standard" }, deStem => { mask => [ qw(standard none forcedAll) ], val => $this->{"-Morphology"} || "standard" }, cmpType => { mask => [ qw(fuzzy exact relaxed) ], val => $this->{"-CharTolerance"} || "relaxed" }, searchLoc => { mask => [ qw(-1 0 1) ], val => $this->{"-Language"} || "0" }, ); my @form; foreach my $var (keys %form) { if (grep { $form{$var}->{val} eq $_ } @{$form{$var}->{mask}}) { push @form, $var . "=" . $form{$var}->{val}; } } # add language push @form, "lp=$lang{speak}"; # # process whitespaces # my $query = $term; $query =~ s/\s\s*/ /g; $query =~ s/\s/\+/g; push @form, "search=$query"; # # make the query cgi'ish # my $form = join "&", @form; # store for result caching $this->{Form} = $form; # # check for proxy settings and use it if exists # otherwise use direct connection # my ($url, $site); my $ip = $this->{"-Host"}; my $port = $this->{"-Port"}; my $proxy_user = $this->{"-ProxyUser"}; my $proxy_pass = $this->{"-ProxyPass"}; if ($this->{"-Proxy"}) { my $proxy = $this->{"-Proxy"}; $proxy =~ s/^http:\/\///i; if ($proxy =~ /^(.+):(.+)\@(.*)$/) { # proxy user account $proxy_user = $1; $proxy_pass = $2; $proxy = $3; $this->debug( "proxy_user: $proxy_user"); } my($host, $pport) = split /:/, $proxy; if ($pport) { $url = "http://$ip:$port/dictQuery/m-vocab/$lang{speak}/de.html"; $port = $pport; } else { $port = 80; } $ip = $host; $this->debug( "connecting to proxy:", $ip, $port); } else { $this->debug( "connecting to site:", $ip, "port", $port); $url = "/dictQuery/m-vocab/$lang{speak}/de.html"; } my $conn = new IO::Socket::INET( Proto => "tcp", PeerAddr => $ip, PeerPort => $port, ) or die "Unable to connect to $ip:$port: $!\n"; $conn->autoflush(1); $this->debug( "GET $url?$form HTTP/1.0"); print $conn "GET $url?$form HTTP/1.0\r\n"; # be nice, simulate Konqueror. print $conn qq($this->{"-UserAgent"} Host: $this->{"-Host"}:$this->{"-Port"} Accept: text/*;q=1.0, image/png;q=1.0, image/jpeg;q=1.0, image/gif;q=1.0, image/*;q=0.8, */*;q=0.5 Accept-Charset: iso-8859-1;q=1.0, *;q=0.9, utf-8;q=0.8 Accept-Language: en_US, en\r\n); if ($this->{"-Proxy"} and $proxy_user) { # authenticate # construct the auth header my $coded = encode_base64("$proxy_user:$proxy_pass"); $this->debug( "Proxy-Authorization: Basic $coded"); print $conn "Proxy-Authorization: Basic $coded\r\n"; } # finish the request print $conn "\r\n"; # # parse dict.leo.org output # my @line = <$conn>; close $conn or die "Connection failed: $!\n"; $this->debug( "connection: done"); $site = join "", @line; if ($site !~ /HTTP\/1\.(0|1) 200 OK/i) { if ($site =~ /HTTP\/1\.(0|1) (\d+) /i) { # got HTTP error my $err = $2; if ($err == 407) { croak "proxy auth required or access denied!\n"; } else { if ($site =~ /Leider konnten wir zu Ihrem Suchbegriff/ || $site =~ /found no matches for your search/ ) { return (); } else { croak "got HTTP error $err!\n"; } } } } my @request = ( { id => 2, row => sub { $this->row(@_); }, hdr => sub { $this->hdr(@_); } }, { id => 3, hdr => sub { $this->hdr(@_); }, row => sub { $this->row(@_); } }, { id => 4, hdr => sub { $this->hdr(@_); }, row => sub { $this->row(@_); } } ); $this->{Linecount} = 0; my $p = HTML::TableParser->new( \@request, { Decode => 1, Trim => 1, Chomp => 1, DecodeNBSP => 1 } ); $site=~s/ /\ \;/g; $p->parse($site); # put the rest on the stack, if any if (@{$this->{section}}) { $this->{data}->{ $this->{title} } = $this->{section}; push @{$this->{segments}}, $this->{title}; } # put back in order my @matches; foreach my $title (@{$this->{segments}}) { push @matches, { title => $title, data => $this->{data}->{$title} }; } return @matches; } sub hdr { my ( $this, $tbl_id, $line_no, $data, $udata ) = @_; if ($data->[1] && $data->[0] eq $data->[1]) { $this->debug("Probable start of a new section: $data->[1]"); if (@{$this->{section}}) { $this->{data}->{ $this->{title} } = $this->{section}; push @{$this->{segments}}, $this->{title}; } $this->{title} = $data->[1]; $this->{section} = []; } } sub row { # # divide rows into titles and lang data. # we get 2 items (left and right column), if they # are equal, it's a segment title, otherwise it's # segment content. left columns ending in HH:MM # are forumposts and ignored as well as rows with # empty left cells. my ( $this, $tbl_id, $line_no, $data, $udata ) = @_; if ($data->[1] && $data->[0] && $data->[0] ne $data->[1] && $data->[0] !~ /\d{2}:\d{2}$/) { if (length($data->[0]) > $this->{Maxsize}) { $this->{Maxsize} = length($data->[0]); } $this->debug("line: $line_no, left: $data->[0], right: $data->[1]"); push @{$this->{section}}, { left => $data->[0], right => $data->[1] }; $this->{Linecount}++; } } sub maxsize { my($this) = @_; return $this->{Maxsize}; } sub lines { my($this) = @_; return $this->{Linecount}; } sub form { my($this) = @_; return $this->{Form}; } sub debug { my($this, $msg) = @_; if ($this->{"-Debug"}) { print STDERR "%DEBUG: $msg\n"; } } 1; =encoding ISO8859-1 =head1 NAME WWW::Dict::Leo::Org - Interface module to dictionary dict.leo.org =head1 SYNOPSIS use WWW::Dict::Leo::Org; my $leo = new WWW::Dict::Leo::Org(); my @matches = $leo->translate($term); =head1 DESCRIPTION B is a module which connects to the website B and translates the given term. It returns an array of hashes. Each hash contains a left side and a right side of the result entry. =head1 OPTIONS B has several parameters, which can be supplied as a hash. All parameters are optional. =over =item I<-Host> The hostname of the dict website to use. For the moment only dict.leo.org is supported, which is also the default - therefore changing the hostname would not make much sense. =item I<-Port> The tcp port to use for connecting, the default is 80, you shouldn't change it. =item I<-UserAgent> The user-agent to send to dict.leo.org site. Currently this is the default: Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9 =item I<-Proxy> Fully qualified proxy server. Specify as you would do in the well known environment variable B, example: -Proxy => "http://192.168.1.1:3128" =item I<-ProxyUser> I<-ProxyPass> If your proxy requires authentication, use these parameters to specify the credentials. =item I<-Debug> If enabled (set to 1), prints a lot of debug information to stderr, normally only required for developers or to report bugs (see below). =back Parameters to control behavior of dict.leo.org: =over =item I<-SpellTolerance> Be tolerant to spelling errors. Default: turned on. Possible values: on, off. =item I<-Morphology> Provide morphology information. Default: standard. Possible values: standard, none, forcedAll. =item I<-CharTolerance> Allow umlaut alternatives. Default: relaxed. Possible values: fuzzy, exact, relaxed. =item I<-Language> Translation direction. Please note that dict.leo.org always translates either to or from german. The following languages are supported: english, polish, spanish, portugese russian and chinese. You can specify only the country code, or append B in order to force translation to german, or preprend B in order to translate to the other language. Valid examples: ru to or from russian de2pl to polish es2de spanish to german Valid country codes: en english es spanish ru russian pt portugese pl polish ch chinese Default: B. =back =head1 METHODS =head2 translate($term) Use this method after initialization to connect to dict.leo.org and translate the given term. It returns an array of hashes containing the actual results. use WWW::Dict::Leo::Org; use Data::Dumper; my $leo = new WWW::Dict::Leo::Org(); my @matches = $leo->translate("test"); print Dumper(\@matches); which prints: $VAR1 = [ { 'data' => [ { 'left' => 'check', 'right' => 'der Test' }, { 'left' => 'quiz (Amer.)', 'right' => 'der Test    [Schule]' ], 'title' => 'Unmittelbare Treffer' }, { 'data' => [ { 'left' => 'to fail a test', 'right' => 'einen Test nicht bestehen' }, { 'left' => 'to test', 'right' => 'Tests macheneinen Test machen' } ], 'title' => 'Verben und Verbzusammensetzungen' }, 'data' => [ { 'left' => 'testing  adj.', 'right' => 'im Test' } ], 'title' => 'Wendungen und Ausdrücke' } ]; You might take a look at the B script how to process this data. =head2 maxsize() Returns the size of the largest returned term (left side). =head2 lines() Returns the number of translation results. =head2 form() Returns the submitted form uri. =head1 SEE ALSO L =head1 COPYRIGHT WWW::Dict::Leo::Org - Copyright (c) 2007-2016 by Thomas v.D. L - Copyright (c) 1995-2016 LEO Dictionary Team. =head1 AUTHOR Thomas v.D. =head1 HOW TO REPORT BUGS Use L to report bugs, select the queue for B. Please don't forget to add debugging output! =head1 VERSION 1.41 =cut