您的位置:首页 > 其它

Androguard的部分源码(一)——androaxml.py

2016-08-31 17:05 127 查看
废话少说,上代码。

# Command-line option table. Each entry keeps the flag spellings under
# 'name'; the remaining keys are forwarded as keyword arguments to
# OptionParser.add_option() by the script entry point.
option_0 = dict(
    name=('-i', '--input'),
    help='filename input (APK or android\'s binary xml)',
    nargs=1,
)
option_1 = dict(
    name=('-o', '--output'),
    help='filename output of the xml',
    nargs=1,
)
option_2 = dict(
    name=('-v', '--version'),
    help='version of the API',
    action='count',
)
options = [option_0, option_1, option_2]

def main(options, arguments):
    """Decode an Android manifest to readable XML and write or print it.

    When ``options.input`` is set, the input is decoded either as an APK or
    as a stand-alone binary XML file, and the pretty-printed result goes to
    ``options.output`` if given, otherwise to standard output.  When only
    ``options.version`` is set, the Androguard version is printed.

    :param options: parsed command-line options; the attributes ``input``,
        ``output`` and ``version`` are read
    :param arguments: remaining positional arguments (not used here)
    """
    if options.input is not None:
        # Inspect the file header to decide what kind of file this is.
        ret_type = androconf.is_android(options.input)
        if ret_type == "APK":
            a = apk.APK(options.input)
            manifest = a.get_android_manifest_xml()
            if manifest is None:
                # get_android_manifest_xml() returns None when the APK has no
                # parsed manifest; report it instead of failing on None.
                print("Unable to decode AndroidManifest.xml")
                return
            buff = manifest.toprettyxml(encoding="utf-8")
        elif ".xml" in options.input:
            ap = apk.AXMLPrinter(read(options.input))
            buff = minidom.parseString(ap.get_buff()).toprettyxml(encoding="utf-8")
        else:
            print("Unknown file type")
            return

        if options.output is not None:
            # toprettyxml(encoding=...) already produced UTF-8 encoded bytes.
            # Write them unchanged in binary mode: pushing them through a
            # UTF-8 encoding writer would try to encode them a second time
            # and fail on any non-ASCII character.
            with open(options.output, "wb") as fd:
                fd.write(buff)
        else:
            # No output file requested: dump to the screen.
            print(buff)

    elif options.version is not None:
        print("Androaxml version %s" % androconf.ANDROGUARD_VERSION)

if __name__ == "__main__":
    # Register every entry of the option table with the parser: the 'name'
    # tuple supplies the flag spellings, the other keys become keyword
    # arguments of add_option().
    parser = OptionParser()
    for option in options:
        param = option.pop('name')
        parser.add_option(*param, **option)

    options, arguments = parser.parse_args()
    # Leave only the unparsed positional arguments in sys.argv.
    sys.argv[:] = arguments
    main(options, arguments)

这是androaxml.py的全部源码。下面说明几点内容。
第一,参数。一个input,可以是apk,或者AndroidManifest.xml。一个output,这是指定的输出文件名,如果不指定输出文件名,则输出到屏幕。

第二,如果为apk,则使用APK()解析

def get_android_manifest_xml(self):
    """
    Return the xml object which corresponds to the AndroidManifest.xml file

    The object is looked up in ``self.xml`` under the key
    ``"AndroidManifest.xml"``; ``None`` is returned when that key is absent.

    :rtype: object
    """
    try:
        return self.xml["AndroidManifest.xml"]
    except KeyError:
        return None

如果是AndroidManifest.xml,则使用AXMLPrinter
而在APK.__init__函数中有这样一段

# Open the raw APK bytes with the zip backend selected by `zipmodule`.
if zipmodule == 0:
    self.zip = ChilkatZip(self.__raw)
elif zipmodule == 2:
    from androguard.patch import zipfile
    self.zip = zipfile.ZipFile(StringIO.StringIO(self.__raw), mode=mode)
else:
    import zipfile
    self.zip = zipfile.ZipFile(StringIO.StringIO(self.__raw), mode=mode)

# Locate the manifest among the archive members, decode it with
# AXMLPrinter and keep both the printer and the parsed DOM.
for i in self.zip.namelist():
    if i == "AndroidManifest.xml":
        self.axml[i] = AXMLPrinter(self.zip.read(i))
        try:
            self.xml[i] = minidom.parseString(self.axml[i].get_buff())
        except Exception:
            # Best effort: a manifest whose decoded text cannot be parsed is
            # recorded as None.  Exception (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            self.xml[i] = None

对apk文件利用ChilkatZip或者ZipFile进行解压,然后从解压后的文件列表当中遍历获取AndroidManifest.xml,再对AndroidManifest.xml调用AXMLPrinter,所以核心的处理在AXMLPrinter当中。

AXMLPrinter则是用AXMLParser对文件进行解析。

所以处理流程就清晰了

APK: 生成APK class实例 ——> 解压文件 ——> 遍历获取AndroidManifest.xml ——>
AXMLPrinter实例 ——>
AXMLParser实例解析

XML: AXMLPrinter实例 ——>
AXMLParser实例解析

class AXMLParser(object):
    """Parser over an Android binary XML buffer (only the constructor is listed here)."""

    def __init__(self, raw_buff):
        """Wrap ``raw_buff`` and check its header.

        ``self.valid_axml`` is set to False, and a warning is emitted, when
        the first 32-bit word is not CHUNK_AXML_FILE.

        :param raw_buff: raw content of the binary XML file
        """
        self.reset()

        self.valid_axml = True
        self.buff = bytecode.BuffHandle(raw_buff)

        # Read the file header (first little-endian 32-bit word).
        axml_file = unpack('<L', self.buff.read(4))[0]

        if axml_file == CHUNK_AXML_FILE:  # check the header magic
            # Skip the next four bytes (presumably the file size -- TODO confirm).
            self.buff.read(4)

            self.sb = StringBlock(self.buff)  # string pool

            self.m_resourceIDs = []
            self.m_prefixuri = {}
            self.m_uriprefix = {}
            self.m_prefixuriL = []

            self.visited_ns = []
        else:
            self.valid_axml = False
            androconf.warning("Not a valid xml file")

# Structure of AXMLParser.buff:



self.__buff保存内容

self.__idx保存已解析的长度,也就是下次解析的起点

class AXMLPrinter(object):
    """Render the events of an AXMLParser as XML text in ``self.buff`` (constructor only is listed here)."""

    def __init__(self, raw_buff):
        """Parse ``raw_buff`` and accumulate the textual XML in ``self.buff``.

        :param raw_buff: raw content of the binary XML file
        """
        self.axml = AXMLParser(raw_buff)  # instantiate the AXMLParser
        self.xmlns = False

        self.buff = u''
        # Main processing loop, entered once the AXMLParser has been
        # instantiated: pull one event at a time and append its textual form.
        while self.axml.is_valid():
            _type = self.axml.next()
            # print "tagtype = ", _type

            if _type == START_DOCUMENT:
                self.buff += u'<?xml version="1.0" encoding="utf-8"?>\n'
            elif _type == START_TAG:
                self.buff += u'<' + self.getPrefix(self.axml.getPrefix()) + self.axml.getName() + u'\n'
                self.buff += self.axml.getXMLNS()

                # One line per attribute: prefix, name, then the escaped value.
                for i in range(self.axml.getAttributeCount()):
                    self.buff += "%s%s=\"%s\"\n" % (
                        self.getPrefix(self.axml.getAttributePrefix(i)),
                        self.axml.getAttributeName(i),
                        self._escape(self.getAttributeValue(i)))

                self.buff += u'>\n'

            elif _type == END_TAG:
                self.buff += "</%s%s>\n" % (self.getPrefix(self.axml.getPrefix()), self.axml.getName())

            elif _type == TEXT:
                self.buff += "%s\n" % self.axml.getText()

            elif _type == END_DOCUMENT:
                break
在前一篇文章《反编译编译后的AndroidManifest》当中也有一段类似的处理逻辑,实现大同小异,都是读取tag,判断是什么chunk,然后处理,可以对比一下。

def next(self):
    """Advance the parser by calling doNext() and return the resulting event.

    :return: the value of ``self.m_event`` after doNext() has run
    """
    self.doNext()
    return self.m_event
def doNext(self):
    """Read chunks from ``self.buff`` until the next XML event is known.

    The event is stored in ``self.m_event`` (START_DOCUMENT, START_TAG,
    END_TAG, TEXT or END_DOCUMENT).  Resource-id and namespace chunks are
    consumed along the way without producing an event of their own.
    """
    if self.m_event == END_DOCUMENT:  # document already finished
        return

    event = self.m_event

    self.reset()
    while True:
        chunkType = -1

        # Fake END_DOCUMENT event.
        if event == END_TAG:  # end-of-tag marker; intentionally a no-op
            pass

        # START_DOCUMENT
        if event == START_DOCUMENT:  # Start Tag Chunk
            # The previous call stopped right after reading this chunk's
            # type word (see the fake START_DOCUMENT below), so the type is
            # set here without reading from the buffer again.
            chunkType = CHUNK_XML_START_TAG
        else:
            if self.buff.end():  # has the buffer been exhausted?
                self.m_event = END_DOCUMENT
                break
            chunkType = unpack('<L', self.buff.read(4))[0]  # read the next four bytes

        if chunkType == CHUNK_RESOURCEIDS:  # ResourceId Chunk
            chunkSize = unpack('<L', self.buff.read(4))[0]
            # FIXME
            if chunkSize < 8 or chunkSize % 4 != 0:  # is the size valid?
                androconf.warning("Invalid chunk size")

            # Floor division keeps the count an integer under true division
            # as well; the two header words already read are excluded.
            for i in range(0, chunkSize // 4 - 2):
                self.m_resourceIDs.append(unpack('<L', self.buff.read(4))[0])

            continue

        # FIXME
        if chunkType < CHUNK_XML_FIRST or chunkType > CHUNK_XML_LAST:  # unrecognised chunk type
            androconf.warning("invalid chunk type")

        # Fake START_DOCUMENT event.
        # The first time a Start Tag Chunk is met, event is still -1:
        # report START_DOCUMENT and hand control back to the caller.
        if chunkType == CHUNK_XML_START_TAG and event == -1:
            self.m_event = START_DOCUMENT
            break

        self.buff.read(4)  # /*chunkSize*/
        lineNumber = unpack('<L', self.buff.read(4))[0]
        self.buff.read(4)  # 0xFFFFFFFF

        if chunkType == CHUNK_XML_START_NAMESPACE or chunkType == CHUNK_XML_END_NAMESPACE:
            if chunkType == CHUNK_XML_START_NAMESPACE:  # Start Namespace Chunk
                prefix = unpack('<L', self.buff.read(4))[0]
                uri = unpack('<L', self.buff.read(4))[0]

                self.m_prefixuri[prefix] = uri
                self.m_uriprefix[uri] = prefix
                self.m_prefixuriL.append((prefix, uri))
                self.ns = uri
            else:  # End Namespace Chunk
                self.ns = -1
                self.buff.read(4)
                self.buff.read(4)
                (prefix, uri) = self.m_prefixuriL.pop()
                #del self.m_prefixuri[ prefix ]
                #del self.m_uriprefix[ uri ]

            continue

        self.m_lineNumber = lineNumber

        # Second time the Start Tag Chunk is seen: event is START_DOCUMENT
        # by now, so the tag itself is decoded.
        if chunkType == CHUNK_XML_START_TAG:
            self.m_namespaceUri = unpack('<L', self.buff.read(4))[0]
            self.m_name = unpack('<L', self.buff.read(4))[0]

            # FIXME
            self.buff.read(4)  # flags

            attributeCount = unpack('<L', self.buff.read(4))[0]
            # High 16 bits: id attribute index + 1; low 16 bits: the count.
            self.m_idAttribute = (attributeCount >> 16) - 1
            attributeCount = attributeCount & 0xFFFF
            self.m_classAttribute = unpack('<L', self.buff.read(4))[0]
            self.m_styleAttribute = (self.m_classAttribute >> 16) - 1

            self.m_classAttribute = (self.m_classAttribute & 0xFFFF) - 1

            # Each attribute occupies ATTRIBUTE_LENGHT 32-bit words.
            for i in range(0, attributeCount * ATTRIBUTE_LENGHT):
                self.m_attributes.append(unpack('<L', self.buff.read(4))[0])

            # Keep only the top byte of every value-type word.
            for i in range(ATTRIBUTE_IX_VALUE_TYPE, len(self.m_attributes), ATTRIBUTE_LENGHT):
                self.m_attributes[i] = self.m_attributes[i] >> 24

            self.m_event = START_TAG
            break

        if chunkType == CHUNK_XML_END_TAG:
            self.m_namespaceUri = unpack('<L', self.buff.read(4))[0]
            self.m_name = unpack('<L', self.buff.read(4))[0]
            self.m_event = END_TAG
            break

        if chunkType == CHUNK_XML_TEXT:
            self.m_name = unpack('<L', self.buff.read(4))[0]

            # FIXME
            self.buff.read(4)
            self.buff.read(4)

            self.m_event = TEXT
            break

doNext函数很长。关注的重点在while循环中。当读取到ResourceId Chunk和Namespace Chunk
则continue。而第一次读到Start Tag Chunk的时候则会退出,返回到AXMLPrinter的主处理逻辑当中。

再看一下AXMLPrinter

while True and self.axml.is_valid():
_type = self.axml.next()
# print "tagtype = ", _type

if _type == START_DOCUMENT:
self.buff += u'<?xml version="1.0" encoding="utf-8"?>\n'
elif _type == START_TAG:
self.buff += u'<' + self.getPrefix(self.axml.getPrefix()) + self.axml.getName() + u'\n'
self.buff += self.axml.getXMLNS()

for i in range(0, self.axml.getAttributeCount()):
self.buff += "%s%s=\"%s\"\n" % (self.getPrefix(
self.axml.getAttributePrefix(i)), self.axml.getAttributeName(i), self._escape(self.getAttributeValue(i)))

self.buff += u'>\n'

elif _type == END_TAG:
self.buff += "</%s%s>\n" % (self.getPrefix(self.axml.getPrefix()), self.axml.getName())

elif _type == TEXT:
self.buff += "%s\n" % self.axml.getText()

elif _type == END_DOCUMENT:
break

self.buff是准备写入解析后的xml文件的字符串。第一次执行next函数,成功读取ResourceId Chunk和Namespace Chunk
之后遇到Start Tag Chunk,修改m_event之后退出。在buff字符串写入u'<?xml version="1.0" encoding="utf-8"?>\n'

然后继续执行next函数,之后再遇到Start Tag Chunk时不会直接退出,而是执行相应的解析操作。

从理论上讲,字符串池、ResourceId Chunk和Namespace Chunk都位于第一个Start Tag Chunk之前,

而这些chunk也并不会直接出现在解析后的xml文件中。

所以先将他们解析,放入准备好的变量容器。第一次遇到Start Tag Chunk说明之前的内容已经处理完毕了,之后就可以将

Start Tag Chunk解析后的结果写入结果字符串。

当结果字符串构造完毕之后,输出到文件或者屏幕即可。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: