#!/usr/bin/perl
#
# Convert a CSV dictionary file into a stream of dictionary entries that is
# printed on STDOUT encoded as UTF-8.
#
# Usage:
#     script CSV_FILE DICTIONARY_NAME [CSV_ENCODING]
#
#   CSV_FILE         path of the CSV file to read
#   DICTIONARY_NAME  title of the dictionary (decoded as shift_jis)
#   CSV_ENCODING     encoding of CSV_FILE; defaults to shift_jis
#
# Column 0 of each CSV record is used as the headword; columns 1 and 2 are
# concatenated to form the explanation text.
#
# NOTE(review): the copy of this script that was reviewed had been collapsed
# onto a few lines and every here-document had lost its "<<EOF;" opener and
# any tag-like text.  The here-documents below reproduce only the text that
# was still visible; restore the markup from the original before relying on
# the generated output.
#
use strict;
use warnings;
use utf8;
use Encode;
use FileHandle;
use Text::CSV_XS;
use HTML::Entities;

# getInflections(\$text, \@inflections)
#
# Looks for the first "【変化】《...》" marker in $text and captures what
# follows it up to the next "、" or line-break marker.  Further "《...》"
# labels inside the captured part are turned into " | " separators, the part
# is split on " | ", and every piece that contains whitespace is dropped
# (this excludes the "more xxxx" / "most xxxx" patterns).
#
# @inflections is overwritten with the result; it is emptied when $text has
# no such marker.  $text itself is not modified.
#
# NOTE(review): the line-break marker is written as "\n" because that is what
# the reviewed copy showed; it may originally have been a markup tag.  It must
# stay identical to the marker appended in the main read loop.
sub getInflections {
    my ($arg, $infs) = @_;

    my ($list) = ${$arg} =~ /【変化】《.+?》(.*?)(?:、|\n).*$/;
    if (!defined $list) {
        @{$infs} = ();
        return;
    }

    # Replace any further part-of-speech label by the " | " separator.
    $list =~ s/ +(?:\| +)*?《.+?》/ | /g;

    # Keep single-word forms only.
    @{$infs} = grep { !/\s/ } split / \| /, $list;
    return;
}

# Substitute(\$text)
#
# Normalises $text in place:
#   - line-break markers become a single space,
#   - "【@】..." sections (up to the next "【") are removed,
#   - "【変化】《...》..." sections (up to the next "、") are removed,
#   - "【大学入試】" labels are removed,
#   - " | " becomes "/",
#   - the punctuation 。、【】《》 is mapped to .,[]<>,
#   - "-N]" (N a single digit) becomes "N]".
sub Substitute {
    my ($arg) = @_;

    ${$arg} =~ s/\n/ /g;
    ${$arg} =~ s/【\@】.*?【/【/g;
    ${$arg} =~ s/【変化】《.+?》.*?(?:、|\n)//g;
    ${$arg} =~ s/【大学入試】//g;
    ${$arg} =~ s/ \| /\//g;

    # NOTE(review): the tail of this mapping ("[]%_()=^" onto itself) is an
    # identity as written; presumably the left side originally held other
    # (e.g. full-width) characters -- confirm against the original script.
    ${$arg} =~ tr/。、【】《》[]%_()=^/.,[]<>[]%_()=^/;

    # Disabled in the original: ${$arg} =~ s/◆ */ /g;
    ${$arg} =~ s/-([0-9])\]/$1\]/g;
    return;
}

# printElements(\@fields)
#
# Debug helper: prints the first two elements of @fields, each normalised by
# Substitute(), on STDOUT encoded as shift_jis.  The array is not modified.
sub printElements {
    my ($array) = @_;

    for my $i (0 .. 1) {
        my $str = $array->[$i];
        Substitute(\$str);
        print encode('shift_jis', "[$i] $str\n");
    }
    return;
}

# =======
# M A I N
# =======
die "please specify the name of file for input.\n" if (@ARGV < 1);
die "please specify the name of your Dictionary.\n" if (@ARGV < 2);

my $charCodeOfCSV = 'shift_jis';
$charCodeOfCSV = $ARGV[2] if @ARGV >= 3;

# Characters that are replaced by entities in every emitted text.
my $unsafe_chars = '<>&"';

my $fileName = $ARGV[0];
my $dictionaryName = decode('shift_jis', encode_entities($ARGV[1], $unsafe_chars));

# NOTE(review): first page of the output; markup lost, see the file header.
my $htmlFirstPage = <<EOF;
$dictionaryName

$dictionaryName

 TOP PAGE 


検索 | */?

EOF
print encode('utf-8', $htmlFirstPage);

# Open read-only with an explicit mode so that characters in the file name
# can never be interpreted as an open mode.
my $fh = FileHandle->new;
$fh->open($fileName, '<') or die "error, Cannot open $fileName: $!\n";

my $encoder = find_encoding($charCodeOfCSV);
my $csv = Text::CSV_XS->new({ binary => 1 });    # binary is required for Japanese text

# Read the file record by record.
my $i = 0;
while (defined(my $raw = <$fh>)) {
    chomp $raw;
    $i++;
    print STDERR "[$i]\r" unless ($i % 2000);    # progress indicator

    my $line = $encoder->decode($raw);

    # A physical line that still ends in CR continues on the next physical
    # line: replace the CR by the line-break marker and append the next line.
    while ($line =~ /\r$/) {
        my $next = <$fh>;
        last unless defined $next;
        chomp $next;
        chop $line;
        $line .= "\n";
        $line .= $encoder->decode($next);
    }

    # Skip records that are not valid CSV, but say so instead of dropping
    # them silently.
    if (!$csv->parse($line)) {
        warn "[$i] skipped: cannot parse CSV record\n";
        next;
    }
    my @fields = $csv->fields();

    # Treat missing columns as empty so that short records do not trigger
    # "uninitialized value" warnings.
    my ($headword, $text1, $text2) = map { defined $_ ? $_ : '' } @fields[0 .. 2];

    my $key = encode_entities($headword, $unsafe_chars);

    # NOTE(review): $escapedKey (backslashes and single quotes escaped) is not
    # referenced by the visible text; presumably the stripped markup used it.
    my $escapedKey = $key;
    $escapedKey =~ s/\\/\\\\/g;
    $escapedKey =~ s/'/\\'/g;

    my $explain = $text1 . $text2;
    my @inflections;
    getInflections(\$explain, \@inflections);

    my $inflectionTags = "";
    if (@inflections) {
        # NOTE(review): only "\n" per inflection is visible here; presumably
        # a tag carrying $inf was stripped from this string.
        foreach my $inf (@inflections) {
            $inflectionTags .= "\n";
        }
        $inflectionTags = "\n$inflectionTags\n";
    }

    Substitute(\$explain);

    # Emit the explanation in pieces of at most 2048 characters; every piece
    # after the first one is prefixed with a continuation mark.
    my $markContinuous = "";
    while (length(my $pieceOfExplain = substr($explain, 0, 2048, '')) > 0) {
        encode_entities($pieceOfExplain, $unsafe_chars);    # modifies in place

        # NOTE(review): entry template; markup lost, see the file header.
        my $entry = <<EOF;

$key\n$inflectionTags

$markContinuous$pieceOfExplain
EOF
        print encode('utf-8', $entry);
        $markContinuous = "[続き]" if ($markContinuous eq '');
    }
}
$fh->close();

# NOTE(review): closing part of the output; its text was lost entirely.
print <<EOF;
EOF
print STDERR "\n";