首页 > 编程 > .NET > 正文

C# 中过滤 HTML 的正则表达式

2020-01-18 01:48:50
字体:
来源:转载
供稿:网友

实现代码

/// <summary>
/// Strips HTML markup from a string and returns the remaining text, HTML-encoded and trimmed.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup; may be null.</param>
/// <returns>
/// The text with script blocks, tags and line breaks removed and a fixed set of entities
/// (quot, amp, lt, gt, nbsp, iexcl, cent, pound, copy) decoded; any other numeric entity is
/// dropped. Literal angle brackets left after decoding are removed as well. The result is
/// HTML-encoded and trimmed. An empty string is returned for null or empty input.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets ".*?" cross line breaks, so scripts that span
    // several lines are removed together with their content.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Remove stray closing brackets and anything from "<!" to the end of the line.
    Htmlstring = Regex.Replace(Htmlstring, @">", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!.*", "", RegexOptions.IgnoreCase);

    // Decode the supported character entities (named or numeric form).
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "  ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
    // Any remaining numeric entity is dropped.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: the result of Replace has to be assigned back to take effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // WebUtility performs the encoding without needing an active HTTP request,
    // so the method also works outside of a web request (background jobs, tests).
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

C# 过滤 HTML 标签及空格

/// <summary>
/// Deletes every HTML tag and every space character from the given text.
/// </summary>
/// <param name="HTMLStr">Text to filter; may be null.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string FilterHTML(string HTMLStr)
{
    // Nothing to filter: answer with an empty string instead of handing null to the regex engine.
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return "";
    }

    // Matches either a complete tag or a single space.
    const string tagOrSpace = "<[^>]*>| ";
    return System.Text.RegularExpressions.Regex.Replace(HTMLStr, tagOrSpace, "");
}

写一个静态方法移除HTML标签

#region ParseTags
/// <summary>
/// Removes every HTML tag from the given text, leaving the text between the tags untouched.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML tags; may be null.</param>
/// <returns>The text without tags, or an empty string when the input is null or empty.</returns>
public static string ParseTags(string HTMLStr)
{
    // Regex.Replace rejects a null input; treat null like empty text instead of throwing.
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");
}
#endregion

取出文本中的图片地址

#region GetImgUrl
/// <summary>
/// Extracts the address of the first image referenced in a piece of HTML.
/// </summary>
/// <param name="HTMLStr">HTML text to scan; may be null.</param>
/// <returns>
/// The value of the <c>src</c> attribute of the first <c>img</c> tag, with its original
/// letter casing and without surrounding quotes; an empty string when no image is found
/// or the input is null or empty.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    // The src value may be double-quoted, single-quoted or unquoted; every alternative
    // captures into the same named group so the quotes never end up in the result.
    const string imgSrcPattern =
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

    // IgnoreCase matches <IMG SRC=...> without lower-casing the input, which would
    // corrupt case-sensitive URLs.
    Match m = Regex.Match(HTMLStr, imgSrcPattern, RegexOptions.IgnoreCase);
    return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion

提取HTML代码中文字的C#函数

using System;
using System.Text.RegularExpressions;

/// <summary>
/// Console sample that extracts the plain text from a piece of HTML.
/// </summary>
public class StripHTMLTest
{
    // Ordered rewrite rules, each entry being { pattern, replacement }. The order matters:
    // "-->" is turned into a line break first so that the following rule can delete a
    // comment from "<!--" up to that line break.
    private static readonly string[][] Rules =
    {
        // Script blocks; (?s) lets ".*?" cross line breaks so multi-line scripts are removed.
        new[] { @"(?s)<script[^>]*?>.*?</script>", "" },
        // Opening and closing tags including their attributes.
        // NOTE(review): the nested quantifiers in this pattern can backtrack heavily on long,
        // unterminated tags - consider a match timeout before feeding it untrusted input.
        new[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
        // A line break together with the whitespace that follows it.
        new[] { @"([\r\n])[\s]+", "" },
        // Character entities, named or numeric form.
        new[] { @"&(quot|#34);", "\"" },
        new[] { @"&(amp|#38);", "&" },
        new[] { @"&(lt|#60);", "<" },
        new[] { @"&(gt|#62);", ">" },
        new[] { @"&(nbsp|#160);", "  " },
        new[] { @"&(iexcl|#161);", "\u00A1" },
        new[] { @"&(cent|#162);", "\u00A2" },
        new[] { @"&(pound|#163);", "\u00A3" },
        new[] { @"&(copy|#169);", "\u00A9" },
        // Any other numeric entity is dropped.
        new[] { @"&#(\d+);", "" },
        // Comment handling, see the note on rule order above.
        new[] { @"-->", "\r\n" },
        new[] { @"<!--.*\n", "" },
    };

    /// <summary>
    /// Runs <see cref="StripHTML"/> on a sample document and prints the result.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML(
            "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Extracts the text content from HTML source.
    /// </summary>
    /// <param name="strHtml">Source text that contains HTML markup; may be null.</param>
    /// <returns>
    /// The text with script blocks, tags, comments and line breaks removed and the supported
    /// entities decoded. Angle brackets that remain after decoding are removed as well.
    /// An empty string is returned for null or empty input.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string strOutput = strHtml;
        foreach (string[] rule in Rules)
        {
            strOutput = Regex.Replace(strOutput, rule[0], rule[1], RegexOptions.IgnoreCase);
        }

        // Strings are immutable: the result of Replace has to be assigned back to take effect.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");
        return strOutput;
    }
}

TempContent 表示包含 HTML 的字符串:
// Removes tags that have at least one character between the angle brackets ("+" quantifier); a bare "<>" is kept.
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent, "<[^>]+>", "");
// Removes tags with any number of characters between the angle brackets ("*" quantifier), including a bare "<>".
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent, "<[^>]*>", "");

发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表