simple/sanitize_umlauts/sanitize_umlauts_utf8

70 lines
1.3 KiB
Plaintext
Raw Permalink Normal View History

#!/usr/bin/perl -w
#
# $Id: sanitize_umlauts_utf8,v 1.1 2006-03-08 18:08:52 hjp Exp $
#
use strict;
use File::Find;
use Encode;
# File names are handled as raw byte strings, so print them without any
# encoding layer on STDOUT.
binmode STDOUT, ":raw";

# Replacement table used by wanted() for names that are not valid UTF-8.
#
# Keys are single bytes as used by the DOS code pages (CP437/CP850) for the
# German umlauts and e-acute; values are the same letters as Latin-1
# characters (code points below 0x100), which wanted() then turns into UTF-8
# via encode().  The values are written as escapes so the table does not
# depend on the encoding of this source file.
#
# "\341" (sharp s in the DOS code pages) is deliberately NOT mapped, because
# it is also a valid Latin-1 character.
my %ex = (
    "\204" => "\xE4",    # a-umlaut
    "\224" => "\xF6",    # o-umlaut
    "\201" => "\xFC",    # u-umlaut
    "\216" => "\xC4",    # A-umlaut
    "\231" => "\xD6",    # O-umlaut
    "\232" => "\xDC",    # U-umlaut
    "\202" => "\xE9",    # e-acute
);
# File::Find callback: rename the current entry if its name is not valid
# UTF-8.
#
# Reads the entry name from $_ and the containing directory from
# $File::Find::dir.  A name that decodes cleanly as UTF-8 is left alone.
# Otherwise every character outside 0x21..0x7E that has an entry in %ex is
# replaced by its mapped value, all other characters are kept, and the result
# is encoded as UTF-8 and used as the new name.
#
# Prints "<dir>: <old> -> <new>" for each rename.  Dies if the target name
# already exists or if rename() fails.
sub wanted {
    my $name = $_;

    # Decode a scratch copy: when CHECK is set without LEAVE_SRC,
    # Encode::decode consumes its argument in place, which would otherwise
    # clobber the $_ that File::Find handed to us.  The trailing 1 makes the
    # eval result (not $@) the success indicator.
    my $probe = $name;
    my $is_valid_utf8 = eval {
        decode("utf-8", $probe, Encode::FB_CROAK());
        1;
    };
    return if $is_valid_utf8;

    # Printable ASCII (0x21..0x7E) is never looked up; anything else is
    # replaced only when the table has an entry for it.
    (my $new = $name) =~ s/([^\x21-\x7E])/defined $ex{$1} ? $ex{$1} : $1/ge;
    $new = encode("utf-8", $new);

    print $File::Find::dir, ": $name -> $new\n";

    # rename() silently replaces an existing target; two different legacy
    # names can map to the same UTF-8 name, so refuse rather than lose a file.
    die "cannot rename $name to $new: target already exists"
        if -e $new || -l $new;

    rename $name, $new or die "cannot rename $name to $new: $!";
}
# With no start directories on the command line, process the current one.
unless (@ARGV) {
    @ARGV = (".");
}

# finddepth() visits a directory's contents before the directory itself, so
# entries inside a directory are handled before that directory is renamed.
finddepth(\&wanted, @ARGV);

print "\n\n";
# $Log: sanitize_umlauts_utf8,v $
# Revision 1.1 2006-03-08 18:08:52 hjp
# Removed szlig conversion. Too dangerous as \341 is a valid character in
# latin-1, too.
#
# Revision 1.1 2002/10/27 12:28:59 hjp
# *** empty log message ***
#
# vim:sw=4 expandtab