您的位置:首页 > Web前端 > HTML

提取HTML代码中文字的C#函数

2005-01-22 09:12 411 查看
  public static string StripHTML(string strHtml)

     string [] aryReg =          @"<script[^>]*?>.*?</script>",

          @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",

          @"([\r\n])[\s]+",

          @"&(quot|#34);",

          @"&(amp|#38);",

          @"&(lt|#60);",

          @"&(gt|#62);", 

          @"&(nbsp|#160);", 

          @"&(iexcl|#161);",

          @"&(cent|#162);",

          @"&(pound|#163);",

          @"&(copy|#169);",

          @"&#(\d+);",

          @"-->",

          @"<!--.*\n"

         

         };

   string [] aryRep =            "",

           "",

           "",

           "\"",

           "&",

           "<",

           ">",

           " ",

           "\xa1",//chr(161),

           "\xa2",//chr(162),

           "\xa3",//chr(163),

           "\xa9",//chr(169),

           "",

           "\r\n",

           ""

          };

   string newReg =aryReg[0];

   string strOutput=strHtml;

   for(int i = 0;i<aryReg.Length;i++)

       Regex regex = new Regex(aryReg[i],RegexOptions.IgnoreCase );

    strOutput = regex.Replace(strOutput,aryRep[i]);

   }

   strOutput.Replace("<","");

   strOutput.Replace(">","");

   strOutput.Replace("\r\n","");

   return strOutput;

  }

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: