From 7944c09286c112f5d2880c4c4d3fdf9a4a86aa1a Mon Sep 17 00:00:00 2001 From: Guilhem Moulin Date: Wed, 7 Mar 2012 01:10:19 +0100 Subject: unicode, re --- videodb-check.pl | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) (limited to 'videodb-check.pl') diff --git a/videodb-check.pl b/videodb-check.pl index dae4cdb..b9fb4ca 100755 --- a/videodb-check.pl +++ b/videodb-check.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -CAL +#!/usr/bin/perl -CADS # This program is free software. It comes without any warranty, to the # extent permitted by applicable law. You can redistribute it and/or @@ -10,6 +10,8 @@ $VERSION = "0.1, 27 November 2011"; use warnings; use strict; +use utf8; +use feature "unicode_strings"; use DBI; use Pod::Usage; @@ -44,7 +46,8 @@ B =head1 DISCLAIMER -Your collection is assumed to have the following structure: two +Your collection is assumed to be encoded in UTF-8, and to have the following +structure: two folders, I and I, that have the same parent. I contains one subdirectory for each director, and each movie lies (B) in the subdirectory of its director. @@ -161,6 +164,11 @@ my @links; # List of missing symlinks my @files; # List of symlinks that have a non existing/valid target my @dirs; # List of symlinks that target to a wrong director foreach (@$res) { + # The database is using UTF-8, but the UTF-8 flag may be missing; + # restore it. + Encode::_utf8_on($_->[0]); + Encode::_utf8_on($_->[1]); + my @ls = File::Spec->splitdir($_->[0]); my $l = File::Spec->catfile ($symlinks, $ls[0]); @@ -170,6 +178,7 @@ foreach (@$res) { } my @target = File::Spec->splitdir (readlink $l); + map {Encode::_utf8_on($_)} @target; push @files, $_->[0] unless ( $target[0] eq File::Spec->updir() and $target[1] eq "DIRECTORS" @@ -199,10 +208,14 @@ while (my $l = readdir(DIR)) { next if $l eq File::Spec->curdir(); next if $l eq File::Spec->updir(); + Encode::_utf8_on($l); my $f = File::Spec->catfile($symlinks, $l); - if ( -d Cwd::realpath($f) ) { - opendir (SUBDIR, Cwd::realpath($f)) - or die "Can't open dir `" .Cwd::realpath($f). ".:" .$!. "\n"; + Encode::_utf8_on($f); + my $rf = Cwd::realpath($f); + Encode::_utf8_on($rf); + if ( -d $rf ) { + opendir (SUBDIR, $rf) + or die "Can't open dir `" .$rf. ".:" .$!. "\n"; while (my $d = readdir(SUBDIR)) { next if $d eq File::Spec->curdir(); next if $d eq File::Spec->updir(); @@ -221,7 +234,7 @@ undef @files; my $sth = $dbh->prepare ( "SELECT id FROM $config{prefix}videodata WHERE filename = ?" ) or die "Error: " .$dbh->errstr; foreach (@filelist) { - $sth->execute ($_) or die "Can't select: " .$dbh->errstr. "\n"; + $sth->execute (&iconv($_)) or die "Can't select: " .$dbh->errstr. "\n"; my @res = $sth->fetchrow_array; die $sth->errstr if $sth->err; @@ -249,9 +262,9 @@ exit $r; sub report { my ($str, $bad) = @_; if (@$bad) { - print LOG "* " .$str. ":\n"; + print LOG &iconv("* " .$str. ":\n"); foreach (@$bad) { - print LOG " ", $_, "\n"; + print LOG &iconv(" " .$_. "\n"); } $r = 1; } @@ -262,5 +275,24 @@ sub check { FROM $config{prefix}videodata WHERE " .$_[1] ) or die "Can't select: " .$dbh->errstr. "\n"; + # The database is using UTF-8, but the UTF-8 flag may be missing; + # restore it. + Encode::_utf8_on($_->[0]); &report ("The following files have " .$_[0], [map {$_->[0]} @$bad]); } + +# Convert to UTF-8 +# The input string should be in latin1 unless the UTF-8 flag is on. +sub iconv { + my $string = $_[0]; + return unless defined $string; + + unless (Encode::is_utf8($string)) { + $string = Encode::decode( 'latin1', $string, 1 ); + } + + # The UTF-8 flag should be on now + warn "Warning: Not a valid Unicode string: \"$string\".\n" + unless utf8::valid($string); + return $string; +} -- cgit v1.2.3