Perl does not attempt to decode filenames returned by builtin functions or modules. Such strings representing filenames should always be decoded explicitly, in order for Perl to recognize them as Unicode.
use v5.14;
use Encode qw(decode_utf8);
# Make sure any error messages printed to the screen are written as UTF-8.
# For this to work, check that your terminal emulator is using UTF-8.
binmode( $_, ':utf8' ) for \*STDOUT, \*STDERR;
# Example1 -- using readdir()
my $dir = '.';
opendir(my $dh, $dir) or die "Could not open directory '$dir': $!";
# Test the raw readdir() result with defined(). Perl only adds the implicit
# defined() check to a bare `while (my $x = readdir $dh)`; once the call is
# wrapped in decode_utf8() the loop would stop early at an entry named "0".
while ( defined( my $entry = readdir $dh ) ) {
    # readdir() returns the name as bytes; decode it before treating it as text.
    my $filename = decode_utf8($entry);
    # Do something with $filename
}
# A handle opened with opendir() has to be released with closedir(), not close().
closedir $dh;
# Example2 -- using getcwd()
use Cwd qw(getcwd);
# Fetch the current directory first, then decode it explicitly so that Perl
# treats the result as a Unicode string.
my $cwd_bytes = getcwd();
my $dir = decode_utf8($cwd_bytes);
# Example3 -- using abs2rel()
# Shows two alternative calls: one passing an explicit $base, one relying on
# the default base.
use File::Spec;
# `use utf8` declares that this source file is UTF-8, so the non-ASCII
# literals below are read as characters rather than raw bytes.
use utf8;
my $base = 'ø';
my $path = "$base/b/æ";
# Variant A: $base is supplied explicitly; the returned path is then decoded.
# NOTE(review): both arguments here are already character strings -- confirm
# with the installed File::Spec/Encode versions that decoding the result does
# not decode it a second time.
my $relpath = decode_utf8( File::Spec->abs2rel( $path, $base ) );
# Note: If you omit $base, you need to encode $path first:
use Encode qw(encode_utf8);
# Variant B: no $base is passed, so $path is encoded to bytes before the call
# and the result is decoded afterwards.
# NOTE(review): this declares $relpath a second time; presumably the two
# variants are alternatives and not meant to share one scope -- confirm.
my $relpath = decode_utf8( File::Spec->abs2rel( encode_utf8( $path ) ) );
# Example4 -- using File::Find::Rule (part1 matching a filename)
use File::Find::Rule;
use utf8;
use Encode qw(encode_utf8);
my $filename = 'æ';
# File::Find::Rule needs the name it matches against to be encoded ...
my $name_bytes = encode_utf8($filename);
# ... and every path it returns is decoded again before being stored.
my @files = map { decode_utf8($_) }
    File::Find::Rule->new->name($name_bytes)->in('.');
# Example5 -- using File::Find::Rule (part2 matching a regular expression)
use File::Find::Rule;
use utf8;
my $pat = '[æ].$'; # Unicode pattern
# Note: In this case: File::Find::Rule->new->name( qr/$pat/ )->in('.')
# will not work since $pat is Unicode and filenames are bytes
# Also encoding $pat first will not work correctly
my @files;
File::Find::Rule->new->exec( sub { wanted( $pat, \@files ) } )->in('.');
# @files needs no further decoding here: wanted() decodes every path before
# storing it. Running decode_utf8() over already-decoded strings a second time
# would mangle non-ASCII names (or die on wide characters).

# wanted( $pat, $files )
#   $pat   - pattern (character string) matched against the decoded name in $_
#   $files - array reference that collects the decoded full path of each match
# Called from the exec() rule above; reads the current entry from $_ and
# $File::Find::name.
sub wanted {
    my ( $pat, $files ) = @_;
    # Decode first so that the Unicode pattern is compared with characters,
    # not with raw bytes.
    my $name = decode_utf8( $_ );
    my $full_name = decode_utf8( $File::Find::name );
    push @$files, $full_name if $name =~ /$pat/;
}
Note: if you are concerned about invalid UTF-8 in the filenames, the use of `decode_utf8( ... )` in the above examples should probably be replaced by `decode( 'utf-8', ... )`. This is because `decode_utf8( ... )` is a synonym for `decode( 'utf8', ... )`, and there is a difference between the encodings `utf-8` and `utf8` (see Remarks below for more information): `utf-8` is stricter about what is acceptable than `utf8`.