您的位置:首页 > 运维架构 > 网站架构

一个网站下载程序遇到的问题

2012-07-01 23:55 375 查看
贴的这些代码,是我一个构想,代码太长了,没全贴,就是把别人的网站下载下来,替换上自己的内容,简单的说就是网站下载程序

这样,我就可以快速建站,我准备先拿一个具体站点做实验,然后做成通用的

在下载别的站点网页的时候,发现以下问题,大家能否解答下?

1、发现很多网址并不规范,导致识别时要针对很多种情况分别编写处理代码,仍然无法全部正确识别。例如 abc.aspx?id=1&&c=1 还算好识别的,而 /do?abc=0 这种 URL 基本就没什么规律了

2、正则表达式效率很高,但是编写实在复杂,花很久才能调试对一个正则表达式,有什么抓取内容的替代方案?

3、程序运行后,效率不高,处理一个实例需要不少时间,我用.net内存分析工具,发现内存增长很快,不知道是我电脑不行还是什么原因,1小时左右偶然会内存溢出

而且有大量 string 字符串滞留在托管堆上无法回收,这是什么原因?

大家分析下代码?

/// <summary>
/// Marks the URL row with the given ID as downloaded (DownloadDone = 1)
/// in the SiteInfo table.
/// </summary>
/// <param name="urlID">Primary key of the SiteInfo row to update.</param>
public static void RecURLDone(int urlID)
{
    // urlID is an int, so this concatenation cannot inject arbitrary SQL,
    // but a parameterized command would still be preferable if IDataBase
    // exposes one — TODO confirm against the IDataBase interface.
    string sql = "update SiteInfo set DownloadDone=1 where ID=" + urlID;

    // NOTE(review): a new factory (and presumably a new connection) is created
    // on every call; consider caching if this shows up in the profiler.
    DataBaseFactory dbFactory = new DataBaseFactory();
    IDataBase mydb = dbFactory.MakeDataBase(EnvConfig.getSystemDataBase(SystemDataBaseEnum.Oracle));
    mydb.OperateDB2(sql);
}

/// <summary>
/// Normalizes each URL in the array against the current page URL:
/// "#" anchors become "", root-relative URLs ("/x") get "http://host"
/// prepended, and relative URLs without a scheme get "http://host/"
/// prepended. Entries already containing "http://" pass through unchanged.
/// </summary>
/// <param name="UrlArry">URLs to normalize. NOTE: modified in place.</param>
/// <param name="CurrUrl">URL of the page the links were extracted from.</param>
/// <returns>The same array instance, with normalized entries.</returns>
public string[] predoURL(string[] UrlArry, string CurrUrl)
{
    // BUG FIX: the original allocated a second array of the same length and
    // immediately discarded it by re-assigning the reference to UrlArry — pure
    // garbage on every call (one likely contributor to the reported memory
    // growth). The aliasing (in-place mutation) behavior is preserved.
    string[] urlArry = UrlArry;

    for (int i = 0; i < urlArry.Length; i++)
    {
        if (urlArry[i] == "")
        {
            continue;
        }

        if (urlArry[i] == "#")
        {
            // In-page anchor: nothing to download.
            urlArry[i] = "";
            continue;
        }

        // Lower-cased/trimmed copy is used only for classification; the
        // original-cased value is what gets rewritten.
        string str = urlArry[i].ToLower().Trim();
        string firstLetter = str.Substring(0, 1);

        if (firstLetter == "/")
        {
            // Root-relative link: prepend "http://host".
            urlArry[i] = getCurrUrlRoot(CurrUrl) + urlArry[i];
        }
        else if (!str.Contains("http://"))
        {
            // Relative link without a scheme: prepend "http://host/".
            // (The original re-tested firstLetter != "/" here, which is
            // always true in this branch; that dead check is removed.)
            urlArry[i] = getCurrUrlRoot(CurrUrl) + "/" + urlArry[i];
        }
    }

    return urlArry;
}

/// <summary>
/// Normalizes a single URL against the current page URL (same rules as the
/// array overload): "#" becomes "", root-relative URLs get "http://host"
/// prepended, relative URLs without a scheme get "http://host/" prepended;
/// URLs already containing "http://" pass through unchanged.
/// </summary>
/// <param name="UrlArry">The URL to normalize (name kept for signature compatibility).</param>
/// <param name="CurrUrl">URL of the page the link was extracted from.</param>
/// <returns>The normalized URL.</returns>
public string predoURL(string UrlArry, string CurrUrl)
{
    string urlArry = UrlArry;

    if (urlArry != "")
    {
        if (urlArry == "#")
        {
            // In-page anchor: nothing to download.
            urlArry = "";
        }
        else
        {
            string firstLetter = urlArry.Substring(0, 1);

            if (firstLetter == "/")
            {
                // Root-relative link: prepend "http://host".
                urlArry = getCurrUrlRoot(CurrUrl) + urlArry;
            }
            else if (!urlArry.Contains("http://"))
            {
                // Relative link without a scheme: prepend "http://host/".
                // (Always-true firstLetter != "/" re-check removed.)
                urlArry = getCurrUrlRoot(CurrUrl) + "/" + urlArry;
            }
        }
    }

    // BUG FIX: removed debug-only leftover
    //   if (urlArry.Contains("http:// www")) { int pp = 0; }
    // which was a dead breakpoint anchor with no effect.
    return urlArry;
}

/// <summary>
/// Extracts the "http://host" root from a URL, e.g.
/// "http://www.foo.com/a/b.htm" -> "http://www.foo.com".
/// Only the http:// scheme is recognized; everything before the first '/'
/// (after stripping the scheme) is treated as the host.
/// </summary>
/// <param name="_url">The URL to take the root from.</param>
/// <returns>"http://" plus the host portion of the URL.</returns>
private string getCurrUrlRoot(string _url)
{
    string url = _url.Replace("http://", "");

    // Single IndexOf replaces the original Contains + IndexOf double scan.
    int slash = url.IndexOf("/");
    string host = slash >= 0 ? url.Substring(0, slash) : url;

    // BUG FIX: removed debug-only leftover
    //   if (url_front.Contains(" ")) { int pp = 0; }
    // which was a dead breakpoint anchor with no effect.
    return "http://" + host;
}

public string[] analysisURL(string _url,string currURL)

{

string AppRoot = EnvConfig.AppPath;

string url = _url;

url=url.Trim().ToLower();

//url=url;

url = predoURL(url, currURL);

//url = url.Replace("http://", "");

int tmp_position = 0;

int tmp_length = 0;

string tmp_str = "";

string url_front="";

string url_back="";

string root_site="";

string saveFileName = "";

string newUrl ="";

int siteType = 2000;

int sitePos = 2000;

url = url.Replace("http://", "");

if (url.Contains("/"))

{

int tt = url.IndexOf("/");

url_front = url.Substring(0, tt);

url_back = url.Substring(tt);

}

else

{

url_front = url;

}

//tmp_position = url.IndexOf(EnvConfig.SiteTypeList[siteType]);

//tmp_length = EnvConfig.SiteTypeList[siteType].Length;

// url_front=url.Substring(0,tmp_position+tmp_length);

// url_back=url.Substring(tmp_position+tmp_length);

root_site = url_front.Replace("http://", "");

//root_site=url_front.Replace("")

string[] temp_dirs=null;

string t_dirs = "/";//构造存储目录变量

if (url_back.Length > 2 && url_back.Contains("/"))

{

temp_dirs = Regex.Split(url_back,"/");

//url_back.Split(

}

//如果URL中没有文件名,保存为 index.htm

bool _findFileName = true;//发现URL包含文件名标志

bool _findQuestionMark = false;

if (_url.Contains("?"))

{

_findQuestionMark = true;

}

if ((temp_dirs != null) && (temp_dirs[temp_dirs.Length - 1]!=""))

{

saveFileName = temp_dirs[temp_dirs.Length-1];

}

else

{ saveFileName = "index.htm"; _findFileName = false; }//如果未发现文件名,则文件名为空

bool chg=false;

for (int i = 0; i < EnvConfig.downLoadFileTypeList.Length; i++)

{

if (saveFileName.Contains(EnvConfig.downLoadFileTypeList[i])) chg = true;

}

if ((!chg)&&(_findQuestionMark==false))

{

tmp_position = saveFileName.LastIndexOf(".");

if (tmp_position >= 0)

{

saveFileName = saveFileName.Substring(0, tmp_position) + ".htm";

}

else

{

saveFileName += ".htm";

}

}

if ((!chg) && (_findQuestionMark == true) && url_front.Contains(EnvConfig.MinhttpURL))

{

string _res = "";

string[] _fileName ;

_fileName = url_back.Split('?');

string _frontFileName = _fileName[0];

string _backFileName = _fileName[1];

int p1 = _frontFileName.IndexOf(".");

if ((_frontFileName != "")&&(p1>0))

{

_res = _frontFileName.Substring(0, p1);

}

string[] tag;

if (_backFileName.Contains("&"))

{

tag = Regex.Split(_backFileName, "&");

for (int i = 0; i < tag.Length; i++)

{

string[] tmp;

tmp = Regex.Split(tag[i], "=");

//HttpUtility.UrlEncode()

if (tmp.Length >= 2)

{

_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);

}

else

{ _res += tag[i]; }

}

}

else

{

string[] tmp;

if (_backFileName.Contains("="))

{

tmp = Regex.Split(_backFileName, "=");

_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);

}

else { _res += _backFileName; }

}

_res += ".htm";

saveFileName = _res;

}

int nCount = 0;

if (temp_dirs != null)

{

for (int i = 0; i < temp_dirs.Length; i++)

{

if (temp_dirs[i] != "" && i != (temp_dirs.Length - 1))

{

nCount++;

t_dirs = t_dirs + "/" + temp_dirs[i] + "/";

}

}

}

t_dirs = root_site + t_dirs;

//控制最小符合URL,防止范围过大

//if (url_front.Contains(EnvConfig.MinhttpURL))

//{ chg = true; }

string desKey=@"南京快餐,南京快餐外送,江宁快餐,江宁快餐配送,南京食堂承包,南京职工食堂承包,http://www.njyhkc.com";

//替换用户关键字

if (EnvConfig.replace_DIR != null)

{

int ccc=EnvConfig.replace_DIR.Length/ EnvConfig.replace_DIR.Rank;

string[,] aaa = EnvConfig.replace_DIR;

for (int i = 0; i < ccc; i++)

{

t_dirs = t_dirs.Replace(aaa[i,0],aaa[i,1]);

}

}

if (EnvConfig.replace_URL != null)

{

int ccc = EnvConfig.replace_URL.Length / EnvConfig.replace_URL.Rank;

string[,] aaa = EnvConfig.replace_URL;

for (int i = 0; i < ccc; i++)

{

saveFileName = saveFileName.Replace(aaa[i, 0], aaa[i, 1]);

}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐