您的位置:首页 > 编程语言 > Python开发

【Python开发】URL 中含中文字符时记得转码 encode("utf-8")

2016-09-26 22:13 603 查看
在 URL 中使用中文其实是一个坏习惯,会带来一系列的转码问题,我更喜欢用英文译名或者 id 来标识某个 URI。但是现实往往是残酷的,特别是在我们调用别人的服务时,有时候被逼无奈只能使用中文 URL。

Python中unicode转码一向是让人头疼的问题。数次碰壁之后,我也摸出了一些门道, 研读完Python字符串的encode与decode 之后,就自认为找到了万金油,谁知道这次又碰上这个老冤家。

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.6/urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.6/urllib2.py", line 391, in open
    response = self._open(req, data)
  File "/usr/lib/python2.6/urllib2.py", line 409, in _open
    '_open', req)
  File "/usr/lib/python2.6/urllib2.py", line 369, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.6/urllib2.py", line 1170, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.6/urllib2.py", line 1142, in do_open
    h.request(req.get_method(), req.get_selector(), req.data, headers)
  File "/usr/lib/python2.6/httplib.py", line 914, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python2.6/httplib.py", line 951, in _send_request
    self.endheaders()
  File "/usr/lib/python2.6/httplib.py", line 908, in endheaders
    self._send_output()
  File "/usr/lib/python2.6/httplib.py", line 780, in _send_output
    self.send(msg)
  File "/usr/lib/python2.6/httplib.py", line 759, in send
    self.sock.sendall(str)
  File "<string>", line 1, in sendall
UnicodeEncodeError: 'ascii' codec can't encode characters in position 7-8: ordinal not in range(128)
这次的错误是由 urlopen() 引发的,很有特色;在调用之前先对 URL 使用 url.encode('utf-8') 转码就可以解决了。今天我做了一些测试。


1. ascii + unicode 测试

>>> 'a' + u'b'
>>> '你' + u'好'
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)
>>> u'你' + u'好'
u'\u4f60\u597d'
>>> u'a' + '你' + u'好'
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)
上面的测试说明了 str(字节串)和 unicode 相连时的行为:Python 2 会先用默认的 ascii 编解码器把 str 隐式解码成 unicode,再做拼接;如果 str 里含有中文(非 ASCII 字节),解码就会失败。结论是有中文记得带上 u,就不会有问题。


2. urllib2的测试

>>> import urllib2
>>> h1 = 'http://baidu.com'
>>> urllib2.urlopen(h1)
<addinfourl at 153439532 whose fp = <socket._fileobject object at 0xb74e51ac>>
>>> h2 = u'http://baidu.com'
>>> urllib2.urlopen(h2)
<addinfourl at 153440236 whose fp = <socket._fileobject object at 0x925912c>>
>>> h3 = 'http://baidu.com?w=测试'
>>> urllib2.urlopen(h3)
<addinfourl at 153482348 whose fp = <socket._fileobject object at 0x92593ac>>
>>> h4 = u'http://baidu.com?w=测试'
>>> urllib2.urlopen(h4)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.6/urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "/usr/lib/python2.6/urllib2.py", line 391, in open
    response = self._open(req, data)
  File "/usr/lib/python2.6/urllib2.py", line 409, in _open
    '_open', req)
  File "/usr/lib/python2.6/urllib2.py", line 369, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.6/urllib2.py", line 1170, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/usr/lib/python2.6/urllib2.py", line 1142, in do_open
    h.request(req.get_method(), req.get_selector(), req.data, headers)
  File "/usr/lib/python2.6/httplib.py", line 914, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python2.6/httplib.py", line 951, in _send_request
    self.endheaders()
  File "/usr/lib/python2.6/httplib.py", line 908, in endheaders
    self._send_output()
  File "/usr/lib/python2.6/httplib.py", line 780, in _send_output
    self.send(msg)
  File "/usr/lib/python2.6/httplib.py", line 759, in send
    self.sock.sendall(str)
  File "<string>", line 1, in sendall
UnicodeEncodeError: 'ascii' codec can't encode characters in position 7-8: ordinal not in range(128)
这个测试说明,urllib2.urlopen() 可以接受 str 或 unicode 类型的纯英文 URL,也可以接受 str(字节串)类型的中文 URL;但是一旦传入的是 unicode 类型的中文 URL,发送请求时就会用 ascii 隐式编码,从而报转码错误。

所以,请尽量使用英文 URL;非要用中文,请记得先用 encode('utf-8') 转码。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐