您的位置：首页 > 编程语言 > Python开发

Python: 'gbk' codec can't encode character '\u30fb' in position 0: illegal multibyte sequence

2017-11-22 10:40 302 查看

先上代码：

代码通过爬取王者荣耀官方网站的数据，然后再写入文件中。获取的英雄、铭文等信息写入时都没有问题，但是写入装备信息时就出现了编码问题。

# -*- coding: utf-8 -*-
"""
Created on Thu Nov 16 16:25:44 2017

@author: 10183930
"""

import requests
import csv
import re
import os

# Request headers shared by the fetch helpers below: a desktop Chrome
# User-Agent string passed to requests.get() via ``headers=``.
JSONHEAD = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'}

def getHeroinfo():
    """Fetch the hero list and pair up three of its columns.

    Returns:
        A ``zip`` iterator of ``(cname, ename, hero_type)`` tuples, one per
        entry of the downloaded JSON array, in the order the server sent them.

    Raises:
        KeyError: if an entry lacks one of the three keys.
    """
    HERO = 'http://pvp.qq.com/web201605/js/herolist.json'
    # Fetch once, with the shared headers. The original issued the request
    # twice and kept only the second, header-less response.
    heroes = requests.get(HERO, headers=JSONHEAD).json()

    # Build the columns eagerly so a missing key fails here, not at iteration.
    hero_name = [hero['cname'] for hero in heroes]
    hero_number = [hero['ename'] for hero in heroes]
    hero_type = [hero['hero_type'] for hero in heroes]
    # The 'hero_type2' column was already disabled in the original and is
    # not exported.

    return zip(hero_name, hero_number, hero_type)

def getIteminfo():
    """Fetch the equipment list and pair each item name with its description.

    Returns:
        A ``zip`` iterator of ``(item_name, des1)`` tuples, one per entry of
        the downloaded JSON array.

    Raises:
        KeyError: if an entry lacks ``item_name`` or ``des1``.
    """
    ITEM = 'http://pvp.qq.com/web201605/js/item.json'
    # Fetch once, with the shared headers. The original issued the request
    # twice and kept only the second, header-less response.
    items = requests.get(ITEM, headers=JSONHEAD).json()

    # Build the columns eagerly so a missing key fails here, not at iteration.
    item_name = [entry['item_name'] for entry in items]
    item_des = [entry['des1'] for entry in items]

    return zip(item_name, item_des)

def getSummonerinfo():
    """Fetch the summoner-skill list and pair up three of its columns.

    Returns:
        A ``zip`` iterator of
        ``(summoner_name, summoner_description, summoner_rank)`` tuples, one
        per entry of the downloaded JSON array.

    Raises:
        KeyError: if an entry lacks one of the three keys.
    """
    SUMMONER = 'http://pvp.qq.com/web201605/js/summoner.json'
    # Fetch once, with the shared headers. The original issued the request
    # twice and kept only the second, header-less response.
    skills = requests.get(SUMMONER, headers=JSONHEAD).json()

    # Build the columns eagerly so a missing key fails here, not at iteration.
    summoner_des = [skill['summoner_description'] for skill in skills]
    summoner_name = [skill['summoner_name'] for skill in skills]
    summoner_rank = [skill['summoner_rank'] for skill in skills]

    return zip(summoner_name, summoner_des, summoner_rank)

def getMinginfo():
    """Fetch the rune ("ming") list and pair up four of its columns.

    Returns:
        A ``zip`` iterator of
        ``(ming_type, ming_name, ming_grade, ming_des)`` tuples, one per entry
        of the downloaded JSON array.

    Raises:
        KeyError: if an entry lacks one of the four keys.
    """
    MING = 'http://pvp.qq.com/web201605/js/ming.json'
    # Fetch once, with the shared headers. The original issued the request
    # twice and kept only the second, header-less response.
    runes = requests.get(MING, headers=JSONHEAD).json()

    # Build the columns eagerly so a missing key fails here, not at iteration.
    ming_type = [rune['ming_type'] for rune in runes]
    ming_name = [rune['ming_name'] for rune in runes]
    ming_grade = [rune['ming_grade'] for rune in runes]
    ming_des = [rune['ming_des'] for rune in runes]

    return zip(ming_type, ming_name, ming_grade, ming_des)

def getHerotype():
    """Scrape the hero list page and map ``data-type`` digits to their labels.

    Returns:
        dict: each single-digit string captured from a ``data-type="N">``
        attribute mapped to a two-character Chinese ``<label>`` text. The two
        match lists are paired purely by position.
    """
    URL = 'http://pvp.qq.com/web201605/herolist.shtml'
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    number_re = re.compile(r' data-type="(\d)">', re.S)
    label_re = re.compile('<label>([\u4e00-\u9fa5]{2})</label>', re.S)

    response = requests.get(URL, headers=JSONHEAD)
    # Must be set before reading .text so the body is decoded as GBK.
    response.encoding = 'GBK'
    data = response.text

    number = number_re.findall(data)
    herotype = label_re.findall(data)
    # NOTE(review): assumes the n-th data-type attribute belongs to the n-th
    # two-character label on the page -- confirm against the live markup.
    return dict(zip(number, herotype))

def _format_row(row):
    """Return *row* rendered with ``str()`` and the tuple punctuation removed."""
    text = str(row)
    # str() of a tuple looks like "('a', 'b')"; drop parentheses and quotes.
    for unwanted in ('(', ')', "'"):
        text = text.replace(unwanted, '')
    # NOTE(review): the original followed this with replace("", ''),
    # replace("", '') and replace(" ", ' '), which are no-ops as written.
    # The search strings were presumably markup or special whitespace lost
    # when the page was extracted; re-add them here if the output needs it.
    return text


def main():
    """Write the equipment list to ``item.csv`` in the current directory.

    Each ``(item_name, des1)`` tuple from ``getIteminfo()`` becomes one line.
    The file is opened with an explicit ``encoding='utf-8'``: as the
    surrounding article explains, relying on the Windows default (gbk) raises
    ``UnicodeEncodeError`` for characters gbk cannot represent.

    The original also carried disabled, string-quoted exports for
    ``summoner.csv``, ``ming.csv`` and ``hero.csv``; they are not reproduced.
    """
    # os.path.join instead of a hard-coded '\\' so the path is also valid
    # outside Windows.
    target = os.path.join(os.getcwd(), 'item.csv')
    # ``with`` closes the file even if a fetch or write raises part-way.
    with open(target, 'w', encoding='utf-8') as fp:
        for item in getIteminfo():
            fp.write(_format_row(item) + '\n')

# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

使用Python写文件的时候，或者将网络数据流写入到本地文件的时候，大部分情况下会遇到：UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position ... 这个问题。有时候在用正则表达式时也会遇到另一类的编码问题

‘can't use a string pattern on a bytes-like object’

参见我的博客 http://blog.csdn.net/zoulonglong/article/details/78547191 。网络上有很多类似的文章讲述如何解决这个问题，但是无非就是encode、decode相关的，这是导致该问题出现的真正原因吗？不是的。很多时候，我们使用了decode和encode，试遍了各种编码，utf8、utf-8、gbk、gb2312等等，该有的编码都试遍了，可是运行的时候仍然出现：
UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position XXX。崩溃了。

    在windows下面编写python脚本，编码问题很严重。尤其是Python3，Python2这种问题能少一点。在Python3里，有几点关于编码的常识：

1、字符就是unicode字符，字符串就是unicode字符数组

2、str转bytes叫encode，bytes转str叫decode

    将网络数据流写入文件时，我们会遇到几个编码：

    1： #encoding='XXX' 这里(也就是python文件第一行的内容)的编码是指该python脚本文件本身的编码，无关紧要。只要XXX和文件本身的编码相同就行了。比如notepad++ "格式"菜单里可以设置各种编码，这时需要保证该菜单里设置的编码和encoding XXX相同就行了，不同的话会报错

    2：网络数据流的编码。比如获取网页，那么网络数据流的编码就是网页的编码。需要使用decode解码成unicode字符串。

    3：目标文件的编码。要将网络数据流写入到新文件，那么我们需要指定新文件的编码。写文件代码如：

f.write(txt)
那么txt是一个字符串，它是通过decode解码过的字符串。关键点就要来了：目标文件的编码是导致标题所指问题的罪魁祸首。如果我们打开一个文件：

f = open("out.html","w")
在windows下面，新文件的默认编码是gbk，这样的话，写入时python会用gbk去编码（encode）字符串txt；然而txt是已经decode过的unicode字符串，其中可能包含gbk无法表示的字符（例如'\xa0'或'\u30fb'），这样就会导致编码失败，出现上述问题。解决的办法就是，改变目标文件的编码：

f = open("out.html","w",encoding='utf-8')
这样，问题将不复存在。
最后，附上一些常用的和中文有关的编码的名称，分别赋值给encoding，就可以看到不同的效果了：

编码名称	用途
utf8	所有语言
gbk	简体中文
gb2312	简体中文
gb18030	简体中文
big5	繁体中文
big5hkscs	繁体中文

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航