#!/usr/bin/perl
#
use strict;
use warnings;
use utf8;
use Encode;
use FileHandle;
use Text::CSV_XS;
use HTML::Entities;
# getInflections(\$text, \@forms)
#
# Fills @forms with the inflected forms listed in a 【変化】 section of $text.
# The section is recognised as 【変化】《...》 followed by the list itself, which
# ends at the first 、 or line break. $text is only read; @forms is overwritten
# and is left empty when the pattern does not match.
#
# Inside the list, forms are separated by " | ". A 《...》 marker within the
# list is folded into a separator, and any form that contains whitespace
# (the "more xxxx" / "most xxxx" patterns) is dropped.
#
# NOTE(review): the trailing ".*$" in the pattern only lets the match succeed
# when the text after the terminator has no further line break (a final one
# aside) -- confirm that this restriction is intended.
sub getInflections(**)
{
    my ($textRef, $formsRef) = @_;

    # The pattern contains a literal line break; the continuation line must
    # stay at column 0 because any leading blanks would become part of it.
    my @captured = ${$textRef} =~ /【変化】《.+?》(.*?)(、|
).*$/;
    @{$formsRef} = @captured;

    if (scalar(@{$formsRef}) > 0) {
        # Element 0 is the raw list; element 1 (the terminator) is not needed.
        my $rawList = $formsRef->[0];
        $rawList =~ s/ +(\| +)*?《.+?》/ \| /g;
        @{$formsRef} = grep { $_ !~ /\s/ } split(/ \| /, $rawList);
    }
}
# Substitute(\$text)
#
# Cleans up the string behind the given scalar reference, in place:
#   - removes the 【@】 section up to (not including) the next 【;
#   - removes the 【変化】 section through its terminator (、 or a line break);
#   - turns the remaining line breaks into spaces;
#   - removes the 【大学入試】 label;
#   - rewrites " | " as "/";
#   - maps 。、【】《》 to . , [ ] < >;
#   - rewrites "-<digit>]" as "<digit>]".
sub Substitute(*)
{
    my $arg = $_[0];

    # /s lets the 【@】 section span line breaks, so the removed span is the
    # same as when line breaks were flattened before this step.
    ${$arg} =~ s/【@】.*?【/【/gs;

    # This must run while line breaks are still in the text. Once they have
    # been turned into spaces the line-break terminator can never match and
    # the removal runs on to a 、 on a later line, deleting unrelated text.
    ${$arg} =~ s/【変化】《.+?》.*?(?:、|\n)//g;

    # Only now flatten the remaining line breaks.
    ${$arg} =~ s/\n/ /g;

    ${$arg} =~ s/【大学入試】//g;
    ${$arg} =~ s/ \| /\//g;

    # NOTE(review): [ ] % _ ( ) = ^ map to themselves in this table; confirm
    # whether other (e.g. full-width) source characters were intended.
    ${$arg} =~ tr/。、【】《》[]%_()=^/.,[]<>[]%_()=^/;
    # ${$arg} =~ s/◆ */ /g;

    # Drop the hyphen in front of a single digit that closes a bracket.
    ${$arg} =~ s/-([0-9])\]/$1\]/g;
}
# printElements(\@array)
#
# Prints elements 0 and 1 of the referenced array to standard output, one per
# line as "[index] text". Each element is copied, passed through Substitute,
# and the resulting line is encoded as Shift_JIS. The array is not modified.
sub printElements(*)
{
    my ($array) = @_;

    # The upper bound is fixed at 1 rather than $#{$array}.
    foreach my $index (0 .. 1) {
        my $text = $array->[$index];
        Substitute(\$text);
        my $line = "[$index] $text\n";
        print encode('shift_jis', $line);
    }
}
# =======
# M A I N
# =======
#
# Command line: <input file> <dictionary name> [character encoding of the CSV]
@ARGV >= 1 or die "please specify the name of file for input.\n";
@ARGV >= 2 or die "please specify the name of your Dictionary.\n";

# Characters that encode_entities replaces when it is called with this set.
my $unsafe_chars = '<>&"';

my $fileName = $ARGV[0];

# The optional third argument overrides the default CSV encoding.
my $charCodeOfCSV = (@ARGV >= 3) ? $ARGV[2] : 'shift_jis';

# The name is entity-escaped as given and then decoded from Shift_JIS.
my $dictionaryName = decode('shift_jis', encode_entities($ARGV[1], $unsafe_chars));
# Front matter of the output, printed once (as UTF-8) before any entry.
# The text is an interpolating here-document, so $dictionaryName is expanded.
# NOTE(review): the body holds only plain text although the variable is named
# as an HTML page -- the markup appears to be missing; restore it from the
# intended page template.
my $htmlFirstPage = <<"EOF";
$dictionaryName
$dictionaryName
TOP PAGE
検索 |
*/?
EOF
print encode('utf-8', $htmlFirstPage);
# Open the input file named on the command line; report the system error
# and stop when that fails.
my $fh = FileHandle->new;
$fh->open($fileName) or die "error, Cannot open $fileName: $!\n";

# Decoder for the CSV's character encoding. find_encoding returns undef for
# an unknown name, so stop here with a clear message instead of failing at
# the first decode call.
my $encoder = find_encoding($charCodeOfCSV)
    or die "error, Unknown character encoding: $charCodeOfCSV\n";

my $csv = Text::CSV_XS->new({binary=>1}); # binary must be 1 to read Japanese text
# Read the input one line at a time. $i counts the lines read by the outer
# loop and drives the progress display on STDERR (every 2000th line).
my $i = 0;
while (defined(my $rawLine = <$fh>)) {
    chomp $rawLine;
    $i++;
    print STDERR "[$i]\r" unless ($i % 2000);
    # next if $i < 193400;
    # last if $i > 193500;
    my $line = $encoder->decode($rawLine);

    # While the text still ends in CR after chomp, the record continues on the
    # next physical line: drop that CR and join the next line with a line
    # feed. At end of file the trailing CR is left in place.
    while ($line =~ /\r$/) {
        my $nextLine = <$fh>;
        last unless defined $nextLine;
        chomp $nextLine;
        chop $line;
        $line .= "\n" . $encoder->decode($nextLine);
    }

    # Parse the record as CSV. parse returns false on failure; such a record
    # is skipped, and reported so that the loss is visible.
    unless ($csv->parse($line)) {
        print STDERR "[$i] CSV parse failed; record skipped\n";
        next;
    }

    # Fetch the fields of the record that parse just accepted. A short row
    # leaves fields undefined; treat those as empty strings.
    my @fields = $csv->fields();
    my ($word, $bodyFirst, $bodySecond) = map { defined $_ ? $_ : '' } @fields[0 .. 2];

    my $key = encode_entities($word, $unsafe_chars);

    # Copy of the key with backslashes and single quotes backslash-escaped.
    # NOTE(review): $escapedKey is not referenced by the output below --
    # presumably it belongs to markup that is missing from the entry template;
    # confirm before removing it.
    my $escapedKey = $key;
    $escapedKey =~ s/\\/\\\\/g;
    $escapedKey =~ s/'/\\'/g;

    # The inflections are taken from the raw text, before Substitute removes
    # the section they are listed in.
    my $explain = $bodyFirst . $bodySecond;
    my @inflections;
    getInflections(\$explain, \@inflections);

    my $inflectionTags = "";
    if (@inflections) {
        # NOTE(review): $inf is not used in the appended string; presumably a
        # per-inflection tag is missing here -- confirm.
        foreach my $inf (@inflections) {
            $inflectionTags .= "\n";
        }
        $inflectionTags = "\n$inflectionTags\n";
    }

    Substitute(\$explain);

    # Emit the explanation in pieces of at most 2048 characters; the 4-argument
    # substr removes each piece from $explain. Every piece after the first is
    # prefixed with the continuation mark.
    my $markContinuous = "";
    while (length(my $pieceOfExplain = substr($explain, 0, 2048, '')) > 0) {
        # Void context: encode_entities escapes the piece in place.
        encode_entities($pieceOfExplain, $unsafe_chars);
        # NOTE(review): the entry template below holds no markup; it appears
        # to be incomplete -- restore it from the intended template.
        my $entry = <<"EOF";
•$key\n$inflectionTags
$markContinuous$pieceOfExplain
EOF
        print encode('utf-8', $entry);
        $markContinuous = "[続き]" if ($markContinuous eq '');
    }
}
$fh->close();

# Closing part of the output, written as a here-document.
# NOTE(review): the body is empty, so nothing is printed here; presumably the
# closing markup of the document is missing -- restore it from the intended
# page template.
print <<"EOF";
EOF

# Finish the progress line that the read loop wrote to STDERR with "\r".
print STDERR "\n";