您的位置:首页 > Web前端 > HTML

C# 正则表达式,去除所有HTML标签

2015-03-01 16:58 706 查看
// Sample HTML fragment that Page_Load rewrites in place.
// Kept on a single line: a regular (non-verbatim) C# string literal cannot contain a raw line break.
protected string str = "<table><tr><td>sdasasdsdd</td></tr></table><br><p>sds</p><img id='img1' src='http://www.baidu.com/img/baidu_logo.gif' width='100' height='50' alt=''>aaassss<br><img src='http://www.baidu.com/img/baidu_logo.gif' width='100' height='50' alt=''> 说是道 ";

    protected void Page_Load(object sender, EventArgs e)

    {

        //string regexstr = @"<[^>]*>";    //去除所有的标签

        //@"<script[^>]*?>.*?</script>" //去除所有脚本,中间部分也删除

         

        // string regexstr = @"<img[^>]*>";   //去除图片的正则

       // string regexstr = @"<(?!br).*?>";   //去除所有标签,只剩br

        // string regexstr = @"<table[^>]*?>.*?</table>";   //去除table里面的所有内容

        string regexstr = @"<(?!img|br|p|/p).*?>";   //去除所有标签,只剩img,br,p

   

        str = Regex.Replace(str, regexstr, string.Empty, RegexOptions.IgnoreCase);

    }

 

 

 

 

 

ASP.NET 去除所有HTML标记 <script type="text/javascript">function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.createRange().text:''):(d.getSelection?d.getSelection():'');void(keyit=window.open('http://www.365key.com/storeit.aspx?t='+escape(d.title)+'&u='+escape(d.location.href)+'&c='+escape(t),'keyit','scrollbars=no,width=475,height=575,left=75,top=20,status=no,resizable=yes'));keyit.focus();}</script>

注意:需要先添加 using System.Text.RegularExpressions;(下面的 NoHTML 还用到了 HttpContext,因此还需要 using System.Web;)

/// <summary>
/// Strips HTML markup from a string, decodes a handful of common entities,
/// removes any angle brackets and CRLF pairs that are left, and returns the
/// result HTML-encoded and trimmed.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup.</param>
/// <returns>The HTML-encoded text with the markup removed.</returns>
/// <remarks>
/// Uses <c>HttpContext.Current</c> for the final encoding step.
/// Because leftover angle brackets are removed after entity decoding,
/// characters produced from <c>&amp;lt;</c> / <c>&amp;gt;</c> are removed as well.
/// </remarks>
public static string NoHTML(string Htmlstring)
{
    // Remove script blocks together with their content. Singleline makes '.'
    // match line breaks, so scripts that span several lines are removed too.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove the remaining tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Collapse a line break followed by further whitespace.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Remove comment terminators, then anything from a comment opener to the end of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode common named / numeric entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
    // Drop every other numeric entity.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: Replace returns a new string, so the result has
    // to be assigned back or the call has no effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();

    return Htmlstring;
}

/**////提取HTML代码中文字的C#函数     

  ///   <summary>   

  ///   去除HTML标记   

  ///   </summary>   

  ///   <param   name="strHtml">包括HTML的源码   </param>   

  ///   <returns>已经去除后的文字</returns>   

  using   System;   

  using   System.Text.RegularExpressions;   

  public   class   StripHTMLTest{   

      public   static   void   Main(){   

          string   s=StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");   

          Console.WriteLine(s);   

      }   

    

      public   static   string   StripHTML(string   strHtml){   

          string   []   aryReg   ={   

                      @"<script[^>]*?>.*?</script>",   

    

                      @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(http://www.cnblogs.com/xchit/admin/file://[%22%22'tbnr]%7c[%5e/7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",   

                      @"([\r\n])[\s]+",   

                      @"&(quot|#34);",   

                      @"&(amp|#38);",   

                      @"&(lt|#60);",   

                      @"&(gt|#62);",     

                      @"&(nbsp|#160);",     

                      @"&(iexcl|#161);",   

                      @"&(cent|#162);",   

                      @"&(pound|#163);",   

                      @"&(copy|#169);",   

                      @"&#(\d+);",   

                      @"-->",   

                      @"<!--.*\n"   

                    };   

4000
    

          string   []   aryRep   =   {   

                        "",   

                        "",   

                        "",   

                        "\"",   

                        "&",   

                        "<",   

                        ">",   

                        "   ",   

                        "\xa1",//chr(161),   

                        "\xa2",//chr(162),   

                        "\xa3",//chr(163),   

                        "\xa9",//chr(169),   

                        "",   

                        "\r\n",   

                        ""   

                      };   

    

          string   newReg   =aryReg[0];   

          string   strOutput=strHtml;   

          for(int   i   =   0;i<aryReg.Length;i++){   

              Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);   

              strOutput   =   regex.Replace(strOutput,aryRep[i]);   

          }   

          strOutput.Replace("<","");   

          strOutput.Replace(">","");   

          strOutput.Replace("\r\n","");   

          return   strOutput;   

      }   

  }

写一个静态方法   

  #region 移除HTML标签

  /**////   <summary>   

  ///   移除HTML标签   

  ///   </summary>   

  ///   <param   name="HTMLStr">HTMLStr</param>   

  public   static   string     ParseTags(string   HTMLStr)   

  {   

  return   System.Text.RegularExpressions.Regex.Replace(HTMLStr,   "<[^>]*>",   "");     

  }   

    

  #endregion   

    

                  #region 取出文本中的图片地址

                  /**////   <summary>   

                  ///   取出文本中的图片地址   

                  ///   </summary>   

                  ///   <param   name="HTMLStr">HTMLStr</param>   

                  public   static   string   GetImgUrl(string   HTMLStr)   

                  {   

                          string   str   =   string.Empty;   

                          string   sPattern   =   @"^<img\s+[^>]*>";   

                          Regex   r   =   new   Regex(@"<img\s+[^>]*\s*src\s*=\s*([']?)(?<url>\S+)'?[^>]*>",   

                                  RegexOptions.Compiled);   

                          Match   m   =   r.Match(HTMLStr.ToLower());   

                          if   (m.Success)   

                                  str   =   m.Result("${url}");   

                          return   str;   

                  }   

    

                  #endregion
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: