一个网站下载程序遇到的问题
2012-07-01 23:55
375 查看
贴的这些代码,是我一个构想,代码太长了,没全贴,就是把别人的网站下载下来,替换上自己的内容,简单的说就是网站下载程序
这样,我就可以快速建站,我准备先拿一个具体站点做实验,然后做成通用的
在下载别的站点网页的时候,发现以下问题,大家能否解答下?
1、发现有很多网址现在不规范,造成在识别时候,写很多种情况,也没法正确识别,例如abc.aspx?id=1&&c=1,这还算好识别的,还有/do?abc=0这种URL基本就没什么规律了
2、正则表达式效率很高,但是编写实在复杂,花很久才能调试好一个正则表达式,有什么抓取内容的替代方案?
3、程序运行后,效率不高,处理一个实例需要不少时间,我用.net内存分析工具,发现内存增长很快,不知道是我电脑不行还是什么原因,1小时左右偶尔会内存溢出,
且有大量string字符串在托管堆上无法回收。
大家分析下代码?
/// <summary>
/// Marks the SiteInfo row with the given ID as downloaded (DownloadDone = 1).
/// </summary>
/// <param name="urlID">Primary key of the SiteInfo row to flag.</param>
public static void RecURLDone(int urlID)
{
    // urlID is an int, so formatting it into the statement cannot inject SQL.
    string updateSql = string.Format("update SiteInfo set DownloadDone=1 where ID={0}", urlID);
    // Resolve the database implementation through the factory (Oracle per EnvConfig).
    DataBaseFactory factory = new DataBaseFactory();
    IDataBase db = factory.MakeDataBase(EnvConfig.getSystemDataBase(SystemDataBaseEnum.Oracle));
    db.OperateDB2(updateSql);
}
/// <summary>
/// Normalizes each extracted URL to an absolute form rooted at the current
/// page's site root. "#" anchors are blanked out; URLs starting with "/" or
/// containing no "http://" are prefixed with the root of <paramref name="CurrUrl"/>.
/// </summary>
/// <param name="UrlArry">Raw URLs extracted from a page; not modified.</param>
/// <param name="CurrUrl">URL of the page the links were taken from.</param>
/// <returns>A new array containing the normalized URLs.</returns>
public string[] predoURL(string[] UrlArry, string CurrUrl)
{
    // BUGFIX: the original allocated a fresh array and then immediately
    // aliased it back to the input (urlArry = UrlArry), so the caller's
    // array was silently mutated in place. Clone so the input stays intact,
    // which is what the original allocation clearly intended.
    string[] urlArry = (string[])UrlArry.Clone();
    for (int i = 0; i < urlArry.Length; i++)
    {
        if (urlArry[i] == "")
        {
            continue;
        }
        if (urlArry[i] == "#")
        {
            // In-page anchor: nothing to download.
            urlArry[i] = "";
            continue;
        }
        // Inspect a lower-cased, trimmed copy; the stored value keeps its
        // original casing (same as the original behavior).
        string probe = urlArry[i].ToLower().Trim();
        if (probe.StartsWith("/"))
        {
            // Site-relative URL: prepend the scheme+host of the current page.
            urlArry[i] = getCurrUrlRoot(CurrUrl) + urlArry[i];
        }
        else if (!probe.Contains("http://"))
        {
            // Page-relative URL: prepend the root plus a separating slash.
            // (The original re-checked firstLetter != "/" here, which is
            // always true on this branch; also, StartsWith no longer throws
            // on whitespace-only entries the way Substring(0, 1) did.)
            urlArry[i] = getCurrUrlRoot(CurrUrl) + "/" + urlArry[i];
        }
    }
    return urlArry;
}
/// <summary>
/// Normalizes a single URL using the same rules as the string[] overload:
/// "#" becomes "", a leading "/" gets the site root prepended, and a
/// relative URL (no "http://") gets root plus "/" prepended.
/// </summary>
/// <param name="UrlArry">Raw URL to normalize.</param>
/// <param name="CurrUrl">URL of the page the link was taken from.</param>
/// <returns>The normalized URL (original casing preserved).</returns>
public string predoURL(string UrlArry, string CurrUrl)
{
    string url = UrlArry;
    if (url == "")
    {
        return url;
    }
    if (url == "#")
    {
        // In-page anchor: nothing to download.
        return "";
    }
    // CONSISTENCY FIX: inspect a lower-cased, trimmed copy, matching the
    // string[] overload; the original compared the raw string, so links
    // written as "HTTP://..." were wrongly treated as relative and got the
    // root prefixed a second time.
    string probe = url.ToLower().Trim();
    if (probe.StartsWith("/"))
    {
        url = getCurrUrlRoot(CurrUrl) + url;
    }
    else if (!probe.Contains("http://"))
    {
        // (The original re-checked firstLetter != "/" here, always true
        // on this branch.)
        url = getCurrUrlRoot(CurrUrl) + "/" + url;
    }
    // Dead debugger-breakpoint block (`if (url.Contains("http:// www")) { int pp = 0; }`)
    // removed: it had no effect at runtime.
    return url;
}
/// <summary>
/// Extracts the site root ("http://" + host) from a URL.
/// </summary>
/// <param name="_url">Full or partial URL, with or without "http://".</param>
/// <returns>"http://" followed by everything before the first "/" of the
/// scheme-stripped URL (the whole string when it has no "/").</returns>
private string getCurrUrlRoot(string _url)
{
    // Strip every "http://" occurrence, matching the original's Replace
    // (which removes all occurrences, not just a leading one).
    string stripped = _url.Replace("http://", "");
    int slash = stripped.IndexOf("/");
    string host = (slash >= 0) ? stripped.Substring(0, slash) : stripped;
    // Dead debugger-breakpoint block (`if (url_front.Contains(" ")) { int pp = 0; }`)
    // removed: it had no effect at runtime.
    return "http://" + host;
}
public string[] analysisURL(string _url,string currURL)
{
string AppRoot = EnvConfig.AppPath;
string url = _url;
url=url.Trim().ToLower();
//url=url;
url = predoURL(url, currURL);
//url = url.Replace("http://", "");
int tmp_position = 0;
int tmp_length = 0;
string tmp_str = "";
string url_front="";
string url_back="";
string root_site="";
string saveFileName = "";
string newUrl ="";
int siteType = 2000;
int sitePos = 2000;
url = url.Replace("http://", "");
if (url.Contains("/"))
{
int tt = url.IndexOf("/");
url_front = url.Substring(0, tt);
url_back = url.Substring(tt);
}
else
{
url_front = url;
}
//tmp_position = url.IndexOf(EnvConfig.SiteTypeList[siteType]);
//tmp_length = EnvConfig.SiteTypeList[siteType].Length;
// url_front=url.Substring(0,tmp_position+tmp_length);
// url_back=url.Substring(tmp_position+tmp_length);
root_site = url_front.Replace("http://", "");
//root_site=url_front.Replace("")
string[] temp_dirs=null;
string t_dirs = "/";//构造存储目录变量
if (url_back.Length > 2 && url_back.Contains("/"))
{
temp_dirs = Regex.Split(url_back,"/");
//url_back.Split(
}
//如果URL中没有文件名,保存为 index.htm
bool _findFileName = true;//发现URL包含文件名标志
bool _findQuestionMark = false;
if (_url.Contains("?"))
{
_findQuestionMark = true;
}
if ((temp_dirs != null) && (temp_dirs[temp_dirs.Length - 1]!=""))
{
saveFileName = temp_dirs[temp_dirs.Length-1];
}
else
{ saveFileName = "index.htm"; _findFileName = false; }//如果未发现文件名,则文件名为空
bool chg=false;
for (int i = 0; i < EnvConfig.downLoadFileTypeList.Length; i++)
{
if (saveFileName.Contains(EnvConfig.downLoadFileTypeList[i])) chg = true;
}
if ((!chg)&&(_findQuestionMark==false))
{
tmp_position = saveFileName.LastIndexOf(".");
if (tmp_position >= 0)
{
saveFileName = saveFileName.Substring(0, tmp_position) + ".htm";
}
else
{
saveFileName += ".htm";
}
}
if ((!chg) && (_findQuestionMark == true) && url_front.Contains(EnvConfig.MinhttpURL))
{
string _res = "";
string[] _fileName ;
_fileName = url_back.Split('?');
string _frontFileName = _fileName[0];
string _backFileName = _fileName[1];
int p1 = _frontFileName.IndexOf(".");
if ((_frontFileName != "")&&(p1>0))
{
_res = _frontFileName.Substring(0, p1);
}
string[] tag;
if (_backFileName.Contains("&"))
{
tag = Regex.Split(_backFileName, "&");
for (int i = 0; i < tag.Length; i++)
{
string[] tmp;
tmp = Regex.Split(tag[i], "=");
//HttpUtility.UrlEncode()
if (tmp.Length >= 2)
{
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else
{ _res += tag[i]; }
}
}
else
{
string[] tmp;
if (_backFileName.Contains("="))
{
tmp = Regex.Split(_backFileName, "=");
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else { _res += _backFileName; }
}
_res += ".htm";
saveFileName = _res;
}
int nCount = 0;
if (temp_dirs != null)
{
for (int i = 0; i < temp_dirs.Length; i++)
{
if (temp_dirs[i] != "" && i != (temp_dirs.Length - 1))
{
nCount++;
t_dirs = t_dirs + "/" + temp_dirs[i] + "/";
}
}
}
t_dirs = root_site + t_dirs;
//控制最小符合URL,防止范围过大
//if (url_front.Contains(EnvConfig.MinhttpURL))
//{ chg = true; }
string desKey=@"南京快餐,南京快餐外送,江宁快餐,江宁快餐配送,南京食堂承包,南京职工食堂承包,http://www.njyhkc.com";
//替换用户关键字
if (EnvConfig.replace_DIR != null)
{
int ccc=EnvConfig.replace_DIR.Length/ EnvConfig.replace_DIR.Rank;
string[,] aaa = EnvConfig.replace_DIR;
for (int i = 0; i < ccc; i++)
{
t_dirs = t_dirs.Replace(aaa[i,0],aaa[i,1]);
}
}
if (EnvConfig.replace_URL != null)
{
int ccc = EnvConfig.replace_URL.Length / EnvConfig.replace_URL.Rank;
string[,] aaa = EnvConfig.replace_URL;
for (int i = 0; i < ccc; i++)
{
saveFileName = saveFileName.Replace(aaa[i, 0], aaa[i, 1]);
}
}
这样,我就可以快速建站,我准备先拿一个具体站点做实验,然后做成通用的
在下载别的站点网页的时候,发现以下问题,大家能否解答下?
1、发现有很多网址现在不规范,造成在识别时候,写很多种情况,也没法正确识别,例如abc.aspx?id=1&&c=1,这还算好识别的,还有/do?abc=0这种URL基本就没什么规律了
2、正则表达式效率很高,但是编写实在复杂,花很久才能调试好一个正则表达式,有什么抓取内容的替代方案?
3、程序运行后,效率不高,处理一个实例需要不少时间,我用.net内存分析工具,发现内存增长很快,不知道是我电脑不行还是什么原因,1小时左右偶尔会内存溢出,
且有大量string字符串在托管堆上无法回收。
大家分析下代码?
/// <summary>
/// Marks the SiteInfo row with the given ID as downloaded (DownloadDone = 1).
/// </summary>
/// <param name="urlID">Primary key of the SiteInfo row to flag.</param>
public static void RecURLDone(int urlID)
{
    // urlID is an int, so formatting it into the statement cannot inject SQL.
    string updateSql = string.Format("update SiteInfo set DownloadDone=1 where ID={0}", urlID);
    // Resolve the database implementation through the factory (Oracle per EnvConfig).
    DataBaseFactory factory = new DataBaseFactory();
    IDataBase db = factory.MakeDataBase(EnvConfig.getSystemDataBase(SystemDataBaseEnum.Oracle));
    db.OperateDB2(updateSql);
}
/// <summary>
/// Normalizes each extracted URL to an absolute form rooted at the current
/// page's site root. "#" anchors are blanked out; URLs starting with "/" or
/// containing no "http://" are prefixed with the root of <paramref name="CurrUrl"/>.
/// </summary>
/// <param name="UrlArry">Raw URLs extracted from a page; not modified.</param>
/// <param name="CurrUrl">URL of the page the links were taken from.</param>
/// <returns>A new array containing the normalized URLs.</returns>
public string[] predoURL(string[] UrlArry, string CurrUrl)
{
    // BUGFIX: the original allocated a fresh array and then immediately
    // aliased it back to the input (urlArry = UrlArry), so the caller's
    // array was silently mutated in place. Clone so the input stays intact,
    // which is what the original allocation clearly intended.
    string[] urlArry = (string[])UrlArry.Clone();
    for (int i = 0; i < urlArry.Length; i++)
    {
        if (urlArry[i] == "")
        {
            continue;
        }
        if (urlArry[i] == "#")
        {
            // In-page anchor: nothing to download.
            urlArry[i] = "";
            continue;
        }
        // Inspect a lower-cased, trimmed copy; the stored value keeps its
        // original casing (same as the original behavior).
        string probe = urlArry[i].ToLower().Trim();
        if (probe.StartsWith("/"))
        {
            // Site-relative URL: prepend the scheme+host of the current page.
            urlArry[i] = getCurrUrlRoot(CurrUrl) + urlArry[i];
        }
        else if (!probe.Contains("http://"))
        {
            // Page-relative URL: prepend the root plus a separating slash.
            // (The original re-checked firstLetter != "/" here, which is
            // always true on this branch; also, StartsWith no longer throws
            // on whitespace-only entries the way Substring(0, 1) did.)
            urlArry[i] = getCurrUrlRoot(CurrUrl) + "/" + urlArry[i];
        }
    }
    return urlArry;
}
/// <summary>
/// Normalizes a single URL using the same rules as the string[] overload:
/// "#" becomes "", a leading "/" gets the site root prepended, and a
/// relative URL (no "http://") gets root plus "/" prepended.
/// </summary>
/// <param name="UrlArry">Raw URL to normalize.</param>
/// <param name="CurrUrl">URL of the page the link was taken from.</param>
/// <returns>The normalized URL (original casing preserved).</returns>
public string predoURL(string UrlArry, string CurrUrl)
{
    string url = UrlArry;
    if (url == "")
    {
        return url;
    }
    if (url == "#")
    {
        // In-page anchor: nothing to download.
        return "";
    }
    // CONSISTENCY FIX: inspect a lower-cased, trimmed copy, matching the
    // string[] overload; the original compared the raw string, so links
    // written as "HTTP://..." were wrongly treated as relative and got the
    // root prefixed a second time.
    string probe = url.ToLower().Trim();
    if (probe.StartsWith("/"))
    {
        url = getCurrUrlRoot(CurrUrl) + url;
    }
    else if (!probe.Contains("http://"))
    {
        // (The original re-checked firstLetter != "/" here, always true
        // on this branch.)
        url = getCurrUrlRoot(CurrUrl) + "/" + url;
    }
    // Dead debugger-breakpoint block (`if (url.Contains("http:// www")) { int pp = 0; }`)
    // removed: it had no effect at runtime.
    return url;
}
/// <summary>
/// Extracts the site root ("http://" + host) from a URL.
/// </summary>
/// <param name="_url">Full or partial URL, with or without "http://".</param>
/// <returns>"http://" followed by everything before the first "/" of the
/// scheme-stripped URL (the whole string when it has no "/").</returns>
private string getCurrUrlRoot(string _url)
{
    // Strip every "http://" occurrence, matching the original's Replace
    // (which removes all occurrences, not just a leading one).
    string stripped = _url.Replace("http://", "");
    int slash = stripped.IndexOf("/");
    string host = (slash >= 0) ? stripped.Substring(0, slash) : stripped;
    // Dead debugger-breakpoint block (`if (url_front.Contains(" ")) { int pp = 0; }`)
    // removed: it had no effect at runtime.
    return "http://" + host;
}
public string[] analysisURL(string _url,string currURL)
{
string AppRoot = EnvConfig.AppPath;
string url = _url;
url=url.Trim().ToLower();
//url=url;
url = predoURL(url, currURL);
//url = url.Replace("http://", "");
int tmp_position = 0;
int tmp_length = 0;
string tmp_str = "";
string url_front="";
string url_back="";
string root_site="";
string saveFileName = "";
string newUrl ="";
int siteType = 2000;
int sitePos = 2000;
url = url.Replace("http://", "");
if (url.Contains("/"))
{
int tt = url.IndexOf("/");
url_front = url.Substring(0, tt);
url_back = url.Substring(tt);
}
else
{
url_front = url;
}
//tmp_position = url.IndexOf(EnvConfig.SiteTypeList[siteType]);
//tmp_length = EnvConfig.SiteTypeList[siteType].Length;
// url_front=url.Substring(0,tmp_position+tmp_length);
// url_back=url.Substring(tmp_position+tmp_length);
root_site = url_front.Replace("http://", "");
//root_site=url_front.Replace("")
string[] temp_dirs=null;
string t_dirs = "/";//构造存储目录变量
if (url_back.Length > 2 && url_back.Contains("/"))
{
temp_dirs = Regex.Split(url_back,"/");
//url_back.Split(
}
//如果URL中没有文件名,保存为 index.htm
bool _findFileName = true;//发现URL包含文件名标志
bool _findQuestionMark = false;
if (_url.Contains("?"))
{
_findQuestionMark = true;
}
if ((temp_dirs != null) && (temp_dirs[temp_dirs.Length - 1]!=""))
{
saveFileName = temp_dirs[temp_dirs.Length-1];
}
else
{ saveFileName = "index.htm"; _findFileName = false; }//如果未发现文件名,则文件名为空
bool chg=false;
for (int i = 0; i < EnvConfig.downLoadFileTypeList.Length; i++)
{
if (saveFileName.Contains(EnvConfig.downLoadFileTypeList[i])) chg = true;
}
if ((!chg)&&(_findQuestionMark==false))
{
tmp_position = saveFileName.LastIndexOf(".");
if (tmp_position >= 0)
{
saveFileName = saveFileName.Substring(0, tmp_position) + ".htm";
}
else
{
saveFileName += ".htm";
}
}
if ((!chg) && (_findQuestionMark == true) && url_front.Contains(EnvConfig.MinhttpURL))
{
string _res = "";
string[] _fileName ;
_fileName = url_back.Split('?');
string _frontFileName = _fileName[0];
string _backFileName = _fileName[1];
int p1 = _frontFileName.IndexOf(".");
if ((_frontFileName != "")&&(p1>0))
{
_res = _frontFileName.Substring(0, p1);
}
string[] tag;
if (_backFileName.Contains("&"))
{
tag = Regex.Split(_backFileName, "&");
for (int i = 0; i < tag.Length; i++)
{
string[] tmp;
tmp = Regex.Split(tag[i], "=");
//HttpUtility.UrlEncode()
if (tmp.Length >= 2)
{
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else
{ _res += tag[i]; }
}
}
else
{
string[] tmp;
if (_backFileName.Contains("="))
{
tmp = Regex.Split(_backFileName, "=");
_res += "_" + HttpUtility.UrlEncode(tmp[0]) + "_" + HttpUtility.UrlEncode(tmp[1]);
}
else { _res += _backFileName; }
}
_res += ".htm";
saveFileName = _res;
}
int nCount = 0;
if (temp_dirs != null)
{
for (int i = 0; i < temp_dirs.Length; i++)
{
if (temp_dirs[i] != "" && i != (temp_dirs.Length - 1))
{
nCount++;
t_dirs = t_dirs + "/" + temp_dirs[i] + "/";
}
}
}
t_dirs = root_site + t_dirs;
//控制最小符合URL,防止范围过大
//if (url_front.Contains(EnvConfig.MinhttpURL))
//{ chg = true; }
string desKey=@"南京快餐,南京快餐外送,江宁快餐,江宁快餐配送,南京食堂承包,南京职工食堂承包,http://www.njyhkc.com";
//替换用户关键字
if (EnvConfig.replace_DIR != null)
{
int ccc=EnvConfig.replace_DIR.Length/ EnvConfig.replace_DIR.Rank;
string[,] aaa = EnvConfig.replace_DIR;
for (int i = 0; i < ccc; i++)
{
t_dirs = t_dirs.Replace(aaa[i,0],aaa[i,1]);
}
}
if (EnvConfig.replace_URL != null)
{
int ccc = EnvConfig.replace_URL.Length / EnvConfig.replace_URL.Rank;
string[,] aaa = EnvConfig.replace_URL;
for (int i = 0; i < ccc; i++)
{
saveFileName = saveFileName.Replace(aaa[i, 0], aaa[i, 1]);
}
}
相关文章推荐
- 博客园网站程序的一个小问题
- 今天遇到一个问题:在程序中加入的定时触发器,当时出现这样一个问题,触发器定时2秒,程序从后台查询数据经过复杂处理后时间超过了2秒,我查阅好些网络上说有可能触发器等待程序执行完毕后,在触发下一次,也有说
- 昨天遇到的一个断言失败,程序退出的问题
- CSDN博客:对于一个程序员,写程序就是实现功能,遇到具体问题,解决这个问题,并记录问题到CSDN上,从而有所提高和互相帮助。
- 430单片机仿真器MSP-FETU430IF遇到VCP问题不能下载程序解决办法详解
- 初学shell,今天遇到由wget下载到本地的网页源代码的乱码问题,无聊的写了一个转码的脚本
- 根据TinyOS官方网站写一个新平台yamp遇到的问题
- 一个用程序解决数学问题的网站
- 调试笔记之 Flash Magic 下载程序 遇到的一些问题
- TFTP下载程序到开发板遇到的问题及解决方案
- C#写一般处理程序遇到的一个String和int转换的问题
- 关于使用J-flash将程序下载到stm32f407ig中遇到的问题
- 内文广告程序开发中遇到的一个问题
- 今天调试程序遇到了一个致命问题语法错误操作符丢失
- 遇到棘手的程序问题,可以解决问题的参考网站
- 今天工作需要调用一个c++程序,以给那个程序发送请求的方式,遇到了跨域问题
- 一个简单的递归求目录的程序遇到的问题
- 初学java,写的一个小程序遇到点问题,又碰上论坛关闭,在此上代码,希望能有人帮我看下,多谢
- 在Tomcat上运行Web程序时遇到的一个奇怪问题.
- IIS7运行.net3.5程序遇到一个语言问题