mirror of
https://codeberg.org/scip/leo.git
synced 2025-12-16 20:21:03 +01:00
Merge pull request #1 from rohieb/xml-api
Implement new XML API [draft]
This commit is contained in:
@@ -12,7 +12,7 @@ WriteMakefile(
|
|||||||
'PREREQ_PM' => { 'Carp::Heavy' => 0,
|
'PREREQ_PM' => { 'Carp::Heavy' => 0,
|
||||||
'IO::Socket' => 0,
|
'IO::Socket' => 0,
|
||||||
'MIME::Base64' => 0,
|
'MIME::Base64' => 0,
|
||||||
'HTML::TableParser' => 0
|
'XML::Simple' => 0
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
207
Org.pm
207
Org.pm
@@ -16,7 +16,8 @@ use Carp::Heavy;
|
|||||||
use Carp;
|
use Carp;
|
||||||
use IO::Socket;
|
use IO::Socket;
|
||||||
use MIME::Base64;
|
use MIME::Base64;
|
||||||
use HTML::TableParser;
|
use XML::Simple;
|
||||||
|
use Encode;
|
||||||
|
|
||||||
sub debug;
|
sub debug;
|
||||||
|
|
||||||
@@ -25,7 +26,7 @@ sub new {
|
|||||||
my $type = ref( $class ) || $class;
|
my $type = ref( $class ) || $class;
|
||||||
|
|
||||||
my %settings = (
|
my %settings = (
|
||||||
"-Host" => "pda.leo.org",
|
"-Host" => "dict.leo.org",
|
||||||
"-Port" => 80,
|
"-Port" => 80,
|
||||||
"-UserAgent" => "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
|
"-UserAgent" => "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
|
||||||
"-Proxy" => "",
|
"-Proxy" => "",
|
||||||
@@ -94,23 +95,8 @@ sub translate {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#
|
|
||||||
# cut invalid values for parameters or set defaults if unspecified
|
|
||||||
#
|
|
||||||
my %form = (
|
|
||||||
spellToler => { mask => [ qw(standard on off) ], val => $this->{"-SpellTolerance"} || "standard" },
|
|
||||||
deStem => { mask => [ qw(standard none forcedAll) ], val => $this->{"-Morphology"} || "standard" },
|
|
||||||
cmpType => { mask => [ qw(fuzzy exact relaxed) ], val => $this->{"-CharTolerance"} || "relaxed" },
|
|
||||||
searchLoc => { mask => [ qw(-1 0 1) ], val => $this->{"-Language"} || "0" },
|
|
||||||
);
|
|
||||||
my @form;
|
|
||||||
foreach my $var (keys %form) {
|
|
||||||
if (grep { $form{$var}->{val} eq $_ } @{$form{$var}->{mask}}) {
|
|
||||||
push @form, $var . "=" . $form{$var}->{val};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# add language
|
# add language
|
||||||
|
my @form;
|
||||||
push @form, "lp=$lang{speak}";
|
push @form, "lp=$lang{speak}";
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -151,7 +137,7 @@ sub translate {
|
|||||||
}
|
}
|
||||||
my($host, $pport) = split /:/, $proxy;
|
my($host, $pport) = split /:/, $proxy;
|
||||||
if ($pport) {
|
if ($pport) {
|
||||||
$url = "http://$ip:$port/dictQuery/m-vocab/$lang{speak}/de.html";
|
$url = "http://$ip:$port/dictQuery/m-vocab/$lang{speak}/query.xml";
|
||||||
$port = $pport;
|
$port = $pport;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -162,7 +148,7 @@ sub translate {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
$this->debug( "connecting to site:", $ip, "port", $port);
|
$this->debug( "connecting to site:", $ip, "port", $port);
|
||||||
$url = "/dictQuery/m-vocab/$lang{speak}/de.html";
|
$url = "/dictQuery/m-vocab/$lang{speak}/query.xml";
|
||||||
}
|
}
|
||||||
|
|
||||||
my $conn = new IO::Socket::INET(
|
my $conn = new IO::Socket::INET(
|
||||||
@@ -197,107 +183,96 @@ Accept-Language: en_US, en\r\n);
|
|||||||
#
|
#
|
||||||
# parse dict.leo.org output
|
# parse dict.leo.org output
|
||||||
#
|
#
|
||||||
my @line = <$conn>;
|
$site = "";
|
||||||
close $conn or die "Connection failed: $!\n";
|
my $got_headers = 0;
|
||||||
$this->debug( "connection: done");
|
while (<$conn>) {
|
||||||
|
if ($got_headers) {
|
||||||
$site = join "", @line;
|
$site .= $_;
|
||||||
|
}
|
||||||
if ($site !~ /HTTP\/1\.(0|1) 200 OK/i) {
|
elsif (/^\r?$/) {
|
||||||
if ($site =~ /HTTP\/1\.(0|1) (\d+) /i) {
|
$got_headers = 1;
|
||||||
# got HTTP error
|
}
|
||||||
my $err = $2;
|
elsif ($_ !~ /HTTP\/1\.(0|1) 200 OK/i) {
|
||||||
if ($err == 407) {
|
if (/HTTP\/1\.(0|1) (\d+) /i) {
|
||||||
croak "proxy auth required or access denied!\n";
|
# got HTTP error
|
||||||
}
|
my $err = $2;
|
||||||
else {
|
if ($err == 407) {
|
||||||
if ($site =~ /Leider konnten wir zu Ihrem Suchbegriff/ ||
|
croak "proxy auth required or access denied!\n";
|
||||||
$site =~ /found no matches for your search/
|
close $conn;
|
||||||
) {
|
|
||||||
return ();
|
return ();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
croak "got HTTP error $err!\n";
|
croak "got HTTP error $err!\n";
|
||||||
|
close $conn;
|
||||||
|
return ();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
my @request = (
|
close $conn or die "Connection failed: $!\n";
|
||||||
{
|
$this->debug( "connection: done");
|
||||||
id => 2,
|
|
||||||
row => sub { $this->row(@_); },
|
|
||||||
hdr => sub { $this->hdr(@_); }
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id => 3,
|
|
||||||
hdr => sub { $this->hdr(@_); },
|
|
||||||
row => sub { $this->row(@_); }
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id => 4,
|
|
||||||
hdr => sub { $this->hdr(@_); },
|
|
||||||
row => sub { $this->row(@_); }
|
|
||||||
}
|
|
||||||
);
|
|
||||||
$this->{Linecount} = 0;
|
|
||||||
my $p = HTML::TableParser->new( \@request,
|
|
||||||
{ Decode => 1, Trim => 1, Chomp => 1, DecodeNBSP => 1 } );
|
|
||||||
$site=~s/ /\ \;/g;
|
|
||||||
$p->parse($site);
|
|
||||||
|
|
||||||
# put the rest on the stack, if any
|
$this->{Linecount} = 0;
|
||||||
if (@{$this->{section}}) {
|
$this->{Maxsize} = 0;
|
||||||
$this->{data}->{ $this->{title} } = $this->{section};
|
|
||||||
push @{$this->{segments}}, $this->{title};
|
# parse the XML
|
||||||
|
my $xml = new XML::Simple;
|
||||||
|
my $data = $xml->XMLin($site,
|
||||||
|
ForceArray => [ 'section', 'entry' ],
|
||||||
|
ForceContent => 1,
|
||||||
|
KeyAttr => { side => 'lang' }
|
||||||
|
);
|
||||||
|
|
||||||
|
my (@matches, $from_lang, $to_lang);
|
||||||
|
$from_lang = substr $lang{speak}, 0, 2;
|
||||||
|
$to_lang = substr $lang{speak}, 2, 2;
|
||||||
|
|
||||||
|
# parse all the <word>s and build a string
|
||||||
|
sub parse_word($) {
|
||||||
|
my $word = shift;
|
||||||
|
if (ref $word eq "HASH") {
|
||||||
|
if ($word->{content}) {
|
||||||
|
return encode('UTF-8', $word->{content});
|
||||||
|
}
|
||||||
|
elsif ($word->{cc}) {
|
||||||
|
# chinese simplified, traditional and pinyin
|
||||||
|
return encode('UTF-8', $word->{cc}->{cs}->{content} . "[" .
|
||||||
|
$word->{cc}->{ct}->{content} . "] " .
|
||||||
|
$word->{cc}->{pa}->{content});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
elsif (ref $word eq "ARRAY") {
|
||||||
|
return encode('UTF-8', @{$word}[-1]->{content});
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return encode('UTF-8', $word);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# put back in order
|
|
||||||
my @matches;
|
foreach my $section (@{$data->{sectionlist}->{section}}) {
|
||||||
foreach my $title (@{$this->{segments}}) {
|
my @entries;
|
||||||
push @matches, { title => $title, data => $this->{data}->{$title} };
|
foreach my $entry (@{$section->{entry}}) {
|
||||||
|
|
||||||
|
my $left = parse_word $entry->{side}->{$from_lang}->{words}->{word};
|
||||||
|
my $right = parse_word $entry->{side}->{$to_lang}->{words}->{word};
|
||||||
|
|
||||||
|
push @entries, { left => $left, right => $right };
|
||||||
|
if ($this->{Maxsize} < length($left)) {
|
||||||
|
$this->{Maxsize} = length($left);
|
||||||
|
}
|
||||||
|
$this->{Linecount}++;
|
||||||
|
}
|
||||||
|
push @matches, {
|
||||||
|
title => encode('UTF-8', $section->{sctTitle}),
|
||||||
|
data => \@entries
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return @matches;
|
return @matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
sub hdr {
|
|
||||||
# HTML::TableParser header callback
|
|
||||||
my ( $this, $tbl_id, $line_no, $data, $udata ) = @_;
|
|
||||||
if ($data->[1] && $data->[0] eq $data->[1]) {
|
|
||||||
$this->debug("Probable start of a new section: $data->[1]");
|
|
||||||
if (@{$this->{section}}) {
|
|
||||||
$this->{data}->{ $this->{title} } = $this->{section};
|
|
||||||
push @{$this->{segments}}, $this->{title};
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->{title} = $data->[1];
|
|
||||||
$this->{section} = [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sub row {
|
|
||||||
# HTML::TableParser data row callback
|
|
||||||
#
|
|
||||||
# divide rows into titles and lang data.
|
|
||||||
# we get 2 items (left and right column), if they
|
|
||||||
# are equal, it's a segment title, otherwise it's
|
|
||||||
# segment content. left columns ending in HH:MM
|
|
||||||
# are forumposts and ignored as well as rows with
|
|
||||||
# empty left cells.
|
|
||||||
my ( $this, $tbl_id, $line_no, $data, $udata ) = @_;
|
|
||||||
my $len = length($data->[0]);
|
|
||||||
if ($data->[1] && $data->[0] && $data->[0] ne $data->[1] && $data->[0] !~ /\d{2}:\d{2}$/) {
|
|
||||||
if ($len > $this->{Maxsize}) {
|
|
||||||
$this->{Maxsize} = $len;
|
|
||||||
}
|
|
||||||
$this->debug("line: $line_no, left: $data->[0], right: $data->[1]");
|
|
||||||
push @{$this->{section}}, { left => $data->[0], right => $data->[1] };
|
|
||||||
$this->{Linecount}++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sub grapheme_length {
|
sub grapheme_length {
|
||||||
my($this, $str) = @_;
|
my($this, $str) = @_;
|
||||||
my $count = 0;
|
my $count = 0;
|
||||||
@@ -399,30 +374,6 @@ Parameters to control behavior of dict.leo.org:
|
|||||||
|
|
||||||
=over
|
=over
|
||||||
|
|
||||||
=item I<-SpellTolerance>
|
|
||||||
|
|
||||||
Be tolerant to spelling errors.
|
|
||||||
|
|
||||||
Default: turned on.
|
|
||||||
|
|
||||||
Possible values: on, off.
|
|
||||||
|
|
||||||
=item I<-Morphology>
|
|
||||||
|
|
||||||
Provide morphology information.
|
|
||||||
|
|
||||||
Default: standard.
|
|
||||||
|
|
||||||
Possible values: standard, none, forcedAll.
|
|
||||||
|
|
||||||
=item I<-CharTolerance>
|
|
||||||
|
|
||||||
Allow umlaut alternatives.
|
|
||||||
|
|
||||||
Default: relaxed.
|
|
||||||
|
|
||||||
Possible values: fuzzy, exact, relaxed.
|
|
||||||
|
|
||||||
=item I<-Language>
|
=item I<-Language>
|
||||||
|
|
||||||
Translation direction. Please note that dict.leo.org always translates
|
Translation direction. Please note that dict.leo.org always translates
|
||||||
|
|||||||
Reference in New Issue
Block a user