From 0e56ed880b55924e16104c77a14c079a499b521d Mon Sep 17 00:00:00 2001 From: hjp Date: Wed, 8 Mar 2006 18:08:52 +0000 Subject: [PATCH] Removed szlig conversion. Too dangerous as \341 is a valid character in latin-1, too. --- sanitize_umlauts/sanitize_umlauts | 12 +++-- sanitize_umlauts/sanitize_umlauts_utf8 | 69 ++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) create mode 100755 sanitize_umlauts/sanitize_umlauts_utf8 diff --git a/sanitize_umlauts/sanitize_umlauts b/sanitize_umlauts/sanitize_umlauts index 1140b7d..5415718 100755 --- a/sanitize_umlauts/sanitize_umlauts +++ b/sanitize_umlauts/sanitize_umlauts @@ -1,6 +1,6 @@ #!/usr/bin/perl -w # -# $Id: sanitize_umlauts,v 1.1 2002-10-27 12:28:59 hjp Exp $ +# $Id: sanitize_umlauts,v 1.2 2006-03-08 18:08:52 hjp Exp $ # use strict; @@ -9,9 +9,9 @@ use File::Find; sub wanted { - if (/[\204\224\201\216\231\232\341\202]/) { + if (/[\204\224\201\216\231\232\202]/) { my $new = $_; - $new =~ tr/\204\224\201\216\231\232\341\202/äöüÄÖÜßé/; + $new =~ tr/\204\224\201\216\231\232\202/äöüÄÖÜé/; print $File::Find::dir, ": $_ -> $new\n"; rename $_, $new or die "cannot rename $_ to $new: $!"; } @@ -26,6 +26,10 @@ print "\n\n"; # $Log: sanitize_umlauts,v $ -# Revision 1.1 2002-10-27 12:28:59 hjp +# Revision 1.2 2006-03-08 18:08:52 hjp +# Removed szlig conversion. Too dangerous as \341 is a valid character in +# latin-1, too. 
+#
+# Revision 1.1 2002/10/27 12:28:59 hjp
 # *** empty log message ***
 #
diff --git a/sanitize_umlauts/sanitize_umlauts_utf8 b/sanitize_umlauts/sanitize_umlauts_utf8
new file mode 100755
index 0000000..06139b9
--- /dev/null
+++ b/sanitize_umlauts/sanitize_umlauts_utf8
@@ -0,0 +1,79 @@
+#!/usr/bin/perl -w
+#
+# $Id: sanitize_umlauts_utf8,v 1.1 2006-03-08 18:08:52 hjp Exp $
+#
+# Walk the directories named on the command line (default: ".") and rename
+# every entry whose name is not valid UTF-8.  In such a name the bytes
+# listed in %ex are replaced by the mapped character, every other byte is
+# taken as a Latin-1 character, and the result is encoded as UTF-8.
+#
+
+use strict;
+use File::Find;
+use Encode;
+
+binmode STDOUT, ":raw";
+
+# Bytes that are replaced before a name is encoded as UTF-8.  The keys match
+# the DOS code page (CP437/CP850) values of the characters named in the
+# trailing comments.  The values are code point escapes, so the table does
+# not depend on the encoding this script itself is stored in.
+# \341 (szlig) is deliberately not listed: it is a valid character in
+# Latin-1, too, so converting it is too dangerous.
+my %ex = (
+    "\204" => "\x{E4}",    # a umlaut
+    "\224" => "\x{F6}",    # o umlaut
+    "\201" => "\x{FC}",    # u umlaut
+    "\216" => "\x{C4}",    # A umlaut
+    "\231" => "\x{D6}",    # O umlaut
+    "\232" => "\x{DC}",    # U umlaut
+    "\202" => "\x{E9}",    # e acute
+);
+
+# Return 1 if the byte string passed in is well-formed UTF-8, 0 otherwise.
+sub is_valid_utf8 {
+    # Work on a private copy: with a CHECK argument decode() may modify the
+    # scalar it is given, and the caller passes File::Find's $_.
+    my ($octets) = @_;
+    return eval { decode("utf-8", $octets, Encode::FB_CROAK()); 1 } ? 1 : 0;
+}
+
+# File::Find callback: $_ holds the name of the current entry.  Names that
+# are already valid UTF-8 (which includes plain ASCII) are left alone.
+# Dies if rename() fails.
+sub wanted {
+    my $old = $_;
+    return if is_valid_utf8($old);
+
+    # A name that mixes valid UTF-8 sequences with stray high bytes fails the
+    # check as a whole, so its UTF-8 sequences are re-encoded byte by byte.
+    my $new = join "", map { exists $ex{$_} ? $ex{$_} : $_ } split //, $old;
+    $new = encode("utf-8", $new);
+
+    # rename() can silently replace an existing file of that name; skip.
+    if (-e $new || -l $new) {
+        warn "$File::Find::dir: not renaming $old: $new already exists\n";
+        return;
+    }
+
+    print $File::Find::dir, ": $old -> $new\n";
+    rename $old, $new or die "cannot rename $old to $new: $!";
+}
+
+if (@ARGV == 0) { push (@ARGV, "."); }
+# finddepth() reports a directory's entries before the directory itself.
+finddepth(\&wanted, @ARGV);
+
+print "\n\n";
+
+
+# $Log: sanitize_umlauts_utf8,v $
+# Revision 1.1 2006-03-08 18:08:52 hjp
+# Removed szlig conversion. Too dangerous as \341 is a valid character in
+# latin-1, too.
+#
+# Revision 1.1 2002/10/27 12:28:59 hjp
+# *** empty log message ***
+#
+# vim:sw=4 expandtab
+