您的位置:首页 > 其它

关于中文文件名字转码

2011-05-05 13:24 267 查看
在Unix服务器上,中文文件名一般以utf-8编码,而使用IIS同步过来的文件,其文件名可能是以gb2312编码的。

现在服务器上这两种编码的文件都存在。

IE浏览器默认是用utf-8编码发送中文URL,迅雷比较猛,可以尝试多种编码方法。

这样就出现了一种情况,有时IE会下载不了服务器上名字以gb2312编码的文件。

解决方法:在服务器上为同一个文件保留两个名字,内容相同,文件名分别采用utf-8和gb2312两种编码。

开始尝试判断文件编码类型,失败告终。

当时使用的是perl Encode模块的一个函数(is_utf8),后来证明不可行:该函数只检查字符串内部的UTF8标志位,并不能判断字节串实际采用的编码方式。

后改用强制转换,但是效果不好,可能会出现错误转换的情况。

#!/usr/bin/perl -w
use Encode;
use utf8;
# Driver.
#   1. Walk the tree and record every regular file in ./filelist.
#   2. Re-read that list and create the re-encoded twin of each file.
use File::Basename qw(dirname basename);

# find_fileindir() prints to the package-level handle `file_list`, so the
# write side has to keep using that bareword handle.
open(file_list, '>', 'filelist') or die "file open error $!";
find_fileindir("/home/hy/case/bbk_test");
# Buffered write errors only surface at close time, so check it.
close(file_list) or die "file close error $!";

open(my $list_fh, '<', 'filelist') or die "file open error $!";
while (my $path = <$list_fh>) {
    chomp $path;
    # find_fileindir() separates each path from the newline with one space.
    $path =~ s/ \z//;
    next if $path eq '';

    # Split the path in-process instead of running `dirname`/`basename` in
    # backticks: the shell would word-split names containing spaces and
    # execute any metacharacters embedded in a file name.
    ecode_utf8_gb(dirname($path), basename($path));
}
close($list_fh);
# find_fileindir($dir)
#
# Recursively walks $dir and prints the path of every regular file below it
# to the package-level `file_list` handle (which the caller must have opened
# for writing), one per line in the form "<dir>//<name> " -- the trailing
# space is part of the line format.
#
# Dies if a directory cannot be opened.
sub find_fileindir {
    my ($dir) = @_;

    # `or` rather than `||`: with `||` the die was attached to the (always
    # true) string "$dir" and a failed opendir went unnoticed.
    opendir(my $dh, $dir) or die "can't open this $dir: $!";
    my @entries = readdir($dh);
    closedir($dh);

    for my $entry (@entries) {
        # Skip only the two pseudo entries; ordinary names that merely end
        # in a dot are still listed.
        next if $entry eq '.' || $entry eq '..';

        my $path = "$dir/$entry";
        if (-d $path) {
            # Do not descend through symlinked directories: a link that
            # points back up the tree would recurse forever.
            find_fileindir($path) unless -l $path;
        }
        elsif (-f $path) {
            print file_list "$dir//$entry \n";
        }
    }
    return;
}
# ecode_utf8_gb($dir, $filen)
#
# Makes the file $dir/$filen available under both spellings of its name:
# first the name is treated as GB2312 and re-encoded to UTF-8, then it is
# treated as UTF-8 and re-encoded to GB2312. Whenever a conversion succeeds
# and no file with the converted name exists yet, the file is copied to that
# name. Progress is reported on STDOUT. Returns nothing.
sub ecode_utf8_gb {
    my ($dir, $filen) = @_;

    _copy_reencoded($dir, $filen, "gb2312", "UTF-8", "1-cp", "nothing to do\n");
    _copy_reencoded($dir, $filen, "UTF-8", "gb2312", "2-cp", "nothing to do u\n");
    return;
}

# Re-encodes the octets of $filen from $from to $to and copies
# $dir/$filen to the converted name if that name is not taken yet.
# $tag prefixes the "cp" trace line; $skip_msg is printed when $filen is not
# valid in $from or cannot be represented in $to.
sub _copy_reencoded {
    my ($dir, $filen, $from, $to, $tag, $skip_msg) = @_;

    # Strict decode + encode. Encode::from_to() never applies its CHECK
    # argument while decoding, so it silently "converts" names that are not
    # in the assumed source encoding and produces garbage twins. With
    # FB_CROAK a wrong guess fails here and is skipped instead.
    # LEAVE_SRC keeps decode/encode from modifying their input in place.
    my $strict    = Encode::FB_CROAK | Encode::LEAVE_SRC;
    my $converted = eval {
        my $chars = Encode::decode($from, $filen, $strict);
        Encode::encode($to, $chars, $strict);
    };
    if (!defined $converted || $converted eq '') {
        print $skip_msg;
        return;
    }

    # Length of the converted name in octets (what from_to() used to return).
    print length($converted) . "\n";

    my $source = "$dir/$filen";
    my $target = "$dir/$converted";
    if (-f $target) {
        print "exits $target\n";
        return;
    }

    print "$tag $source $target\n";
    # List-form system() bypasses the shell, so spaces and metacharacters in
    # file names are passed to cp verbatim.
    system('cp', '--', $source, $target) == 0
        or print "cp error $filen\n";
    return;
}


后来用C重写了一个,效果比较好:不再复制文件,而是创建软链接;iconv在转换失败时会报错,用它来判断字符编码还是比较准确的。

#include "stdio.h"
#include "iconv.h"
#include <unistd.h>
#include <dirent.h>
#include <string.h>
#include <sys/stat.h>
#include <errno.h>
// Size of the scratch buffers used for file names; main() allocates twice
// this much for a converted name.
#define MAX_URL_LENGTH 1024
// This is used to encode & decode between gb2312 and utf-8
static int code_convert(char *from_charset,char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen);
static int u2g(char* orig_name,char* new_name,size_t length);
static int g2u(char* orig_name,char* new_name,size_t length);
// Verbosity threshold: DEBUG() only prints while level > 3.
static int level  = 10;
// printf-style trace to stdout, flushed immediately.
// Wrapped in do { } while (0) so the macro expands to exactly one statement
// and stays correct in an unbraced if/else: `if (x) DEBUG("..."); else ...`.
#define DEBUG(args, ...)        do { if( level > 3)  {   printf(args, ##__VA_ARGS__);fflush(stdout);} } while (0)
int main(int argc, char *argv[])
{
int i = 0;
if(argc != 2 ){
DEBUG("please input format: ppk_conert convert_dir /n ") ;
DEBUG("convert_dir : has abselute directory like :/home/test//n ") ;
return 1;
}
int ret = 0;
DIR *dp;
struct dirent *entry;
struct stat statbuf;
char *dir= argv[1];
FILE *fd;
fd = fopen("result.txt","w+");
if((dp = opendir(dir)) == NULL) {
DEBUG("cannot open directory: %s/n ", dir);
return 0;
}
chdir(dir);
char new_name[MAX_URL_LENGTH*2] = "0";
while((entry = readdir(dp)) != NULL) {
lstat(entry->d_name, &statbuf);
if(S_ISDIR(statbuf.st_mode)){
if(strcmp( ".",entry->d_name) == 0 ||strcmp( "..",entry->d_name) == 0)
continue;
continue;
}
char *origin_name = entry->d_name;
memset(new_name,0,MAX_URL_LENGTH*2);
ret = u2g(origin_name,new_name,MAX_URL_LENGTH*2);
if(ret != -1)
{
int res = symlink(origin_name,new_name);
if(res != 0 ){
DEBUG("symlink fail because:orig_file %s new_file %s error %s/n",origin_name,new_name,strerror(errno));
}
else{
DEBUG("u2g success convert orig_file: %s to new_file: %s /n",origin_name,new_name);
fprintf(fd,"u2g success convert orig_file: %s to new_file: %s /n",origin_name,new_name);
}
continue;
}
memset(new_name,0,MAX_URL_LENGTH*2);
ret = g2u(origin_name,new_name,MAX_URL_LENGTH*2);
if(ret != -1)
{
int res = symlink(origin_name,new_name);
if(res != 0 ){
DEBUG("symlink fail because:orig_file %s new_file %s error %s/n",origin_name,new_name,strerror(errno));
}
else{
DEBUG("g2u success convert orig_file: %s to new_file: %s /n",origin_name,new_name);
fprintf(fd,"g2u success convert orig_file: %s to new_file: %s /n",origin_name,new_name);
}
continue;
}
}
fclose(fd);
closedir(dp);
DEBUG("the successful converting file list is in result.txt at current dir, please checking/n");
return 0;
}

/*
 * Converts the NUL-terminated UTF-8 string orig_name to GB2312.
 *
 * orig_name  input string
 * new_name   output buffer, always NUL-terminated on return
 * length     size of new_name in bytes
 *
 * Returns 0 on success, -1 if an argument is unusable or the conversion
 * fails (input is not valid UTF-8, has no GB2312 form, or does not fit).
 */
static int u2g(char* orig_name,char* new_name,size_t length)
{
    if (orig_name == NULL || new_name == NULL || length == 0)
        return -1;

    /* Clear the whole buffer and hand out one byte less than its size, so
     * the result stays NUL-terminated even if it fills the space exactly. */
    memset(new_name, 0, length);

    /* Convert exactly strlen() bytes straight from the caller's string; no
     * fixed-size staging copy that a long name could overflow. */
    return code_convert("utf-8","gb2312",orig_name,strlen(orig_name),new_name,length - 1);
}
/*
 * Converts the NUL-terminated GB2312 string orig_name to UTF-8.
 *
 * orig_name  input string
 * new_name   output buffer, always NUL-terminated on return
 * length     size of new_name in bytes
 *
 * Returns 0 on success, -1 if an argument is unusable or the conversion
 * fails (input is not valid GB2312, or the result does not fit).
 */
static int g2u(char* orig_name,char* new_name,size_t length)
{
    if (orig_name == NULL || new_name == NULL || length == 0)
        return -1;

    /* Clear the whole buffer and hand out one byte less than its size, so
     * the result stays NUL-terminated even if it fills the space exactly. */
    memset(new_name, 0, length);

    /* Convert exactly strlen() bytes straight from the caller's string; no
     * fixed-size staging copy that a long name could overflow. */
    return code_convert("gb2312","utf-8",orig_name,strlen(orig_name),new_name,length - 1);
}
/*
 * Converts inlen bytes at inbuf from from_charset to to_charset with iconv.
 *
 * outbuf is zero-filled over outlen bytes before the converted text is
 * written into it. No terminator is added beyond that zero fill, so callers
 * that need a C string should pass one byte less than the buffer size.
 *
 * Returns 0 on success, -1 if the converter cannot be opened or the
 * conversion fails.
 */
static int code_convert(char *from_charset,char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    iconv_t cd;
    char *in_cursor = inbuf;    /* iconv() advances both cursors */
    char *out_cursor = outbuf;
    size_t rc;

    cd = iconv_open(to_charset, from_charset);
    /* iconv_open() reports failure as (iconv_t)-1, not as 0. */
    if (cd == (iconv_t)-1) return -1;

    memset(outbuf, 0, outlen);
    rc = iconv(cd, &in_cursor, &inlen, &out_cursor, &outlen);
    iconv_close(cd);

    if (rc == (size_t)-1)
    {
        /* in_cursor now points at the first byte that was not converted. */
        DEBUG("iconv error %s \n", in_cursor);
        return -1;
    }
    DEBUG("iconv ok \n");
    return 0;
}


这种解决方法是被迫的,最好的解决方案是统一使用utf8编码,这样就避免了各种编码混乱带来的不便。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐