您的位置:首页 > 编程语言 > C语言/C++

获取 文件 后缀 c++

2018-01-30 16:40 316 查看
正常情况下可以通过SHGetFileInfoA 或 PathFindExtension 获取文件的类型,当文件无后缀时就需要想别的方法了。

以下以DOC和DOCX文件类型为例子:

正常情况生成的doc文件其二进制数据如下:



正常情况docx文件其二进制数据如下:



通过对比可以看到doc和docx的二进制数据开头都不一样,因此在没有文件后缀的情况下可以从该方面来判断文件类型。

以下是c++代码:

{
    // Classify a file by its leading signature bytes instead of its extension.
    // The path below is a placeholder; fileType ends up as NONE, DOC or DOCX.
    const char* files = "c:\\sdfsdf";
    enum { NONE, DOC, DOCX };
    int fileType = NONE;
    FILE* file = fopen(files, "rb");
    if (file)
    {
        unsigned char buff[32] = { 0 };
        size_t length = fread(buff, 1, 10, file);
        // Only buff[0..3] are inspected, so four bytes read are sufficient.
        if (length >= 4)
        {
            if (buff[0] == 0x50 && buff[1] == 0x4B && buff[2] == 0x03 && buff[3] == 0x04)
            {
                // "PK\x03\x04": a normally created docx.
                fileType = DOCX;
            }
            else if (buff[0] == 0xD0 && buff[1] == 0xCF && buff[2] == 0x11 && buff[3] == 0xE0)
            {
                // D0 CF 11 E0: a normally created doc.
                fileType = DOC;
            }
            else if (buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)
            {
                // The bytes spell "{\rt" in ASCII: a doc that was not created the normal way.
                fileType = DOC;
            }
        }
        fclose(file);
    }
}

注: 通过查看已知文件的二进制数据可以判断相应类型的文件

第一次补充:

通过上面简单的二进制判断可以处理已知文件,但准确率不高。要提高精确度可以使用StructuredStorageHeader结构读取文件流头部信息,再结合具体信息,从而提高判断的精确度。

#include <Shlwapi.h>
#include <Shellapi.h>

// Integer and identifier aliases; several of them are used by the fields of
// StructuredStorageHeader. The trailing comments give the size each alias is
// expected to have.
// NOTE(review): the 4-byte sizes assume a platform where `long` is 4 bytes - confirm.
typedef unsigned long ULONG;    // 4 Bytes
typedef unsigned short USHORT;  // 2 Bytes
typedef short OFFSET;           // 2 Bytes
typedef ULONG SECT;             // 4 Bytes
typedef ULONG FSINDEX;          // 4 Bytes
typedef USHORT FSOFFSET;        // 2 Bytes
//typedef USHORT WCHAR;           // 2 Bytes
typedef ULONG DFSIGNATURE;      // 4 Bytes
typedef unsigned char BYTE;     // 1 Byte
typedef unsigned short WORD;    // 2 Bytes
typedef unsigned long DWORD;    // 4 Bytes
//typedef ULONG SID;              // 4 Bytes
typedef GUID CLSID;             // 16 Bytes

// Header found at the start of a compound (structured storage) file.
// Each field comment gives [offset from start, length] in bytes; the last
// field starts at 4CH and is 436 bytes long, so the listed layout ends at
// byte 512.
// NOTE(review): copying raw file bytes into this struct relies on the compiler
// inserting no padding - confirm sizeof(StructuredStorageHeader) == 512.
struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
BYTE _abSig[8];             // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
// 0x1a, 0xe1} for current version
CLSID _clsid;               // [08H,16] reserved must be zero (WriteClassStg/
// GetClassFile uses root directory class id)
USHORT _uMinorVersion;      // [18H,02] minor version of the format: 33 is
// written by reference implementation
USHORT _uDllVersion;        // [1AH,02] major version of the dll/format: 3 for
// 512-byte sectors, 4 for 4 KB sectors
USHORT _uByteOrder;         // [1CH,02] 0xFFFE: indicates Intel byte-ordering
USHORT _uSectorShift;       // [1EH,02] size of sectors in power-of-two;
// typically 9 indicating 512-byte sectors
USHORT _uMiniSectorShift;   // [20H,02] size of mini-sectors in power-of-two;
// typically 6 indicating 64-byte mini-sectors
USHORT _usReserved;         // [22H,02] reserved, must be zero
ULONG _ulReserved1;         // [24H,04] reserved, must be zero
FSINDEX _csectDir;          // [28H,04] must be zero for 512-byte sectors,
// number of SECTs in directory chain for 4 KB
// sectors
FSINDEX _csectFat;          // [2CH,04] number of SECTs in the FAT chain
SECT _sectDirStart;         // [30H,04] first SECT in the directory chain
DFSIGNATURE _signature;     // [34H,04] signature used for transactions; must
// be zero. The reference implementation
// does not support transactions
ULONG _ulMiniSectorCutoff;  // [38H,04] maximum size for a mini stream;
// typically 4096 bytes
SECT _sectMiniFatStart;     // [3CH,04] first SECT in the MiniFAT chain
FSINDEX _csectMiniFat;      // [40H,04] number of SECTs in the MiniFAT chain
SECT _sectDifStart;         // [44H,04] first SECT in the DIFAT chain
FSINDEX _csectDif;          // [48H,04] number of SECTs in the DIFAT chain
SECT _sectFat[109];         // [4CH,436] the SECTs of first 109 FAT sectors
};
#pragma warning(disable:4996)

void main(int argc, char* argv[])
{
{
#ifdef _DEBUG
char* files = "sss";
#else
char* files = argv[1];
#endif
FILE* file;
file = fopen(files, "rb");
enum{ NONE, DOC, DOCX };
int fileType = 0;
if (file)
{
unsigned char  buff[512] = { 0 };
StructuredStorageHeader Head;
int lenth = sizeof(StructuredStorageHeader);
size_t length = fread(buff, 1, lenth, file);
if (length == lenth)
{
memcpy(&Head, buff, lenth);
CLSID docID = { 0 };
if (docID == Head._clsid)
{
fileType = DOC;
}
else
{
CLSID docxID = { 0 };
docxID.Data1 = 0x4e870000;
docxID.Data2 = 0x40e2;
if (docxID == Head._clsid) //wps
{
fileType = DOCX;
}
else
{
docxID.Data1 = 8;
docxID.Data2 = 0x21;
docxID.Data3 = 0x4fbd;
docxID.Data4[0] = 92;
docxID.Data4[1] = 82;
docxID.Data4[2] = 0x9d;
docxID.Data4[3] = 1;
docxID.Data4[4] = 0;
docxID.Data4[5] = 0;
docxID.Data4[6] = 0x29;
docxID.Data4[7] = 7;
if (docxID == Head._clsid) //office
{
fileType = DOCX;
}
}
}
if (!fileType && buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)  //非正常创建doc
{
fileType = DOC; //
}
printf("%08x-%04x-%x-%02x%02x%02x%02x%02x%02x", Head._clsid.Data1, Head._clsid.Data2, Head._clsid.Data3,
Head._clsid.Data4[2], Head._clsid.Data4[3],
Head._clsid.Data4[4], Head._clsid.Data4[5],
Head._clsid.Data4[6], Head._clsid.Data4[7]);
}
fclose(file);
}
return;
}
}


参考:

https://en.wikipedia.org/wiki/Compound_File_Binary_Format --StructuredStorageHeader

第二次补充:
在实验过程中,会误判ppt为doc。在查看了ppt文件对应的二进制数据后,重新优化了代码:

FILE* file;
file = fopen(files, "rb");
enum{ NONE, DOC, DOCX };
int fileType = 0;
if (file)
{
unsigned char  buff[512] = { 0 };
StructuredStorageHeader Head;
int lenth = sizeof(StructuredStorageHeader);
size_t length = fread(buff, 1, lenth, file);
if (length == lenth)
{
memcpy(&Head, buff, lenth);
CLSID docID = { 0 };
if (docID == Head._clsid)
{
if(buff[60] ==0x31||  //office doc   //区分 ppt doc
(buff[60] == 2 && buff[80] == 0xFF))    //wps doc
fileType = DOC;
}
else
{
if (Head._abSig[0] == 'P' && Head._abSig[1] == 'K')
{
CLSID docxID = { 0 };
docxID.Data1 = 0x4e870000;
docxID.Data2 = 0x40e2;
if (docxID == Head._clsid) //wps
{
fileType = DOCX;
}
else
{
docxID.Data1 = 8;
docxID.Data2 = 0x21;
if (docxID.Data1 == Head._clsid.Data1 && docxID.Data2 == Head._clsid.Data2) //office
{
fileType = DOCX;
}
}
}
}
if (!fileType && buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)  //非正常创建doc
{
fileType = DOC; //
}
printf("%08x-%04x-%x-%02x%02x%02x%02x%02x%02x", Head._clsid.Data1, Head._clsid.Data2, Head._clsid.Data3,
Head._clsid.Data4[2], Head._clsid.Data4[3],
Head._clsid.Data4[4], Head._clsid.Data4[5],
Head._clsid.Data4[6], Head._clsid.Data4[7]);
}
fclose(file);
}
printf("%s", fileType == 1? "DOC":fileType == 2? "DOCX":"NOLL");



第三次补充:
由于word支持打开的子版本太多,导致有许多可以用word打开的文件没有成功判断,现更改判断方法:
// Binary substring search.
//
// Sets rstValue to a true value when the subStrLen bytes at subStr occur
// contiguously anywhere within the first fullstrLent bytes at fullStr, and to
// a false value otherwise. An empty pattern (subStrLen == 0) always matches.
// Both buffers are compared byte by byte as unsigned char, so embedded zero
// bytes are handled.
//
// Every candidate start position is tried in turn; a partial match that fails
// therefore cannot hide a real match that overlaps it (e.g. "aab" in "aaab").
// The lengths are converted to size_t, so they must not be negative.
#define _FLY_STRING_FindBitSub(fullStr, fullstrLent, subStr, subStrLen, rstValue)\
do {\
    const unsigned char *fly_hay_ = (const unsigned char *)(fullStr); \
    const unsigned char *fly_pat_ = (const unsigned char *)(subStr); \
    const size_t fly_hay_len_ = (size_t)(fullstrLent); \
    const size_t fly_pat_len_ = (size_t)(subStrLen); \
    size_t fly_start_ = 0; \
    int fly_found_ = (fly_pat_len_ == 0); \
    while (!fly_found_ && fly_pat_len_ <= fly_hay_len_ && fly_start_ <= fly_hay_len_ - fly_pat_len_)\
    {\
        size_t fly_k_ = 0; \
        while (fly_k_ < fly_pat_len_ && fly_hay_[fly_start_ + fly_k_] == fly_pat_[fly_k_])\
        {\
            fly_k_++; \
        }\
        fly_found_ = (fly_k_ == fly_pat_len_); \
        fly_start_++; \
    }\
    rstValue = (fly_found_ != 0); \
} while (0)
// Reports whether the last 512 bytes of `file` contain the textLen bytes at
// `text`. Returns false when seeking to 512 bytes before the end fails.
static bool tailContains(FILE* file, const char* text, size_t textLen)
{
    unsigned char tail[512] = { 0 };
    if (fseek(file, -(long)sizeof(tail), SEEK_END) != 0)
    {
        return false;
    }
    size_t got = fread(tail, 1, sizeof(tail), file);
    bool found = false;
    _FLY_STRING_FindBitSub(tail, got, text, textLen, found);
    return found;
}

// Classifies the file named by argv[1] and prints "DOC", "DOCX" or "NOLL".
//
// The file must be at least 512 bytes long to be classified:
//  - header D0 CF 11 E0: doc if the 32 bytes after the header contain the wps
//    marker, or if the last 512 bytes contain "Word.Document";
//  - header "PK": docx if the last 512 bytes contain "ord";
//  - header "{\rt": doc that was not created the normal way.
// "NOLL" is also printed when no path was given or the file cannot be opened.
// Always returns 0.
int main(int argc, char* argv[])
{
    enum { NONE, DOC, DOCX };
    int fileType = NONE;

    // Do not hand a missing argument to fopen.
    const char* files = (argc > 1) ? argv[1] : NULL;
    FILE* file = files ? fopen(files, "rb") : NULL;
    if (file)
    {
        // The header keeps its own buffer so the later reads cannot clobber
        // the signature bytes that the branches below are keyed on.
        unsigned char head[512] = { 0 };
        if (fread(head, 1, sizeof(head), file) == sizeof(head))
        {
            if (head[0] == 0xD0 && head[1] == 0xCF && head[2] == 0x11 && head[3] == 0xE0) // doc ppt xml
            {
                unsigned char probe[32] = { 0 };
                if (fread(probe, 1, sizeof(probe), file) == sizeof(probe))
                {
                    const unsigned char wpsDoc[] = { 0xFD, 0xFF, 0xFF, 0xFF, 0x05, 0x00, 0, 0, 0xFE, 0xFF, 0xFF, 0xFF, 0x04 };
                    bool isWpsDoc = false;
                    _FLY_STRING_FindBitSub(probe, sizeof(probe), wpsDoc, sizeof(wpsDoc), isWpsDoc);
                    if (isWpsDoc) // wps doc
                    {
                        fileType = DOC;
                    }
                    else if (tailContains(file, "Word.Document", strlen("Word.Document"))) // office doc
                    {
                        fileType = DOC;
                    }
                }
            }
            else if (head[0] == 'P' && head[1] == 'K') // docx
            {
                // "ord" covers word, Word and ord/ (wps and office).
                if (tailContains(file, "ord", strlen("ord")))
                {
                    fileType = DOCX;
                }
            }
            else if (head[0] == 0x7B && head[1] == 0x5C && head[2] == 0x72 && head[3] == 0x74)
            {
                // The bytes spell "{\rt" in ASCII: a doc not created the normal way.
                fileType = DOC;
            }
        }
        fclose(file);
    }
    printf("%s", fileType == DOC ? "DOC" : fileType == DOCX ? "DOCX" : "NOLL");
    return 0;
}

参考:
https://www.cnblogs.com/WangAoBo/p/6366211.html --文件头

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  文件后缀 判断