首页 > 学院 > 开发设计 > 正文

HTML解析类 ,让你不使用正则也能轻松获取HTML相关元素 -C# .NET

2019-11-17 02:24:24
字体:
来源:转载
供稿:网友

HTML解析类 ,让你不使用正则也能轻松获取HTML相关元素 -C# .NET

功能:

1、轻松获取指元素HTML元素。

2、可以根据属性标签进行筛选

3、返回的都是Llist强类型无需转换

用过XElement的都知道 用来解析xml非常的方便,但是对于HTML的格式多样化实在是没办法兼容。

所以我就写了这么一个类似XElement的 XHTMLElement

用法:

            string filePath = Server.MapPath("~/file/test.htm");            //获取HTML代码            string mailBody = FileHelper.FileToString(filePath);            XHtmlElement xh = new XHtmlElement(mailBody);            //获取body的子集a标签并且class="icon"            var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();            //获取带href的a元素            var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();            foreach (var r in links)            {                Response.Write(r.Attributes.Single(c => c.Key == "href").Value); //出输href            }            //获取第一个img            var img = xh.Descendants("img");            //获取最近的第一个p元素以及与他同一级的其它p元素            var ps = xh.Descendants("p");

代码:

using System;using System.Collections.Generic;using System.Linq;using System.Web;using System.Text;using System.Text.RegularExPRessions;namespace SyntacticSugar{    /// <summary>    /// ** 描述:html解析类    /// ** 创始时间:2015-4-23    /// ** 修改时间:-    /// ** 作者:sunkaixuan    /// ** QQ:610262374 欢迎交流,共同提高 ,命名语法等写的不好的地方欢迎大家的给出宝贵建议    /// </summary>    public class XHtmlElement    {        private string _html;        public XHtmlElement(string html)        {            _html = html;        }        /// <summary>        /// 获取最近的相同层级的HTML元素        /// </summary>        /// <param name="elementName">等于null为所有元素</param>        /// <returns></returns>        public List<HtmlInfo> Descendants(string elementName = null)        {            if (_html == null)            {                throw new ArgumentNullException("html不能这空!");            }            var allList = RootDescendants(_html);            var reval = allList.Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();            if (reval == null || reval.Count == 0)            {                reval = GetDescendantsSource(allList, elementName);            }            return reval;        }        /// <summary>        /// 获取第一级元素        /// </summary>        /// <param name="elementName"></param>        /// <returns></returns>        public List<HtmlInfo> RootDescendants(string html = null)        {            /*             * 业务逻辑:                         * 1、获取第一个html标签一直找结尾标签,如果在这个过程中遇到相同的标签收尾标签就要加1                         * 2、第一个标签取到后继续第一步操作,找第2个元素 。。第N个元素             */            if (html == null) html = _html;            var firstTag = Regex.Match(html, "<.+?>");            List<string> eleList = new List<string>();            List<HtmlInfo> reval = new List<HtmlInfo>();            GetElementsStringList(html, ref eleList);            foreach (var r in eleList)            {                HtmlInfo data = new HtmlInfo();                data.OldFullHtml = r;                data.SameLeveHtml = html;                data.TagName = Regex.Match(r, @"(?<=/s{1}|/<)[a-z,A-Z]+(?=/>|/s)", RegexOptions.IgnoreCase).Value;                data.InnerHtml = Regex.Match(r, @"(?<=/>).+(?=<)", RegexOptions.Singleline).Value;                var eleBegin = Regex.Match(r, "<.+?>").Value;                var attrList = Regex.Matches(eleBegin, @"[a-z,A-Z]+/="".+?""").Cast<Match>().Select(c => new { key = c.Value.Split('=').First(), value = c.Value.Split('=').Last().TrimEnd('"').TrimStart('"') }).ToList();                data.Attributes = new Dictionary<string, string>();                if (attrList != null && attrList.Count > 0)                {                    foreach (var a in attrList)                    {                        data.Attributes.Add(a.key, a.value);                    }                }                reval.Add(data);            }            return reval;        }        #region private        private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)        {            foreach (var r in allList)            {                if (r.InnerHtml == null || !r.InnerHtml.Contains("<")) continue;                var childList = RootDescendants(r.InnerHtml).Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();                if (childList == null || childList.Count == 0)                {                    childList = GetDescendantsSource(RootDescendants(r.InnerHtml), elementName);                    if (childList != null && childList.Count > 0)                        return childList;                }                else                {                    return childList;                }            }            return null;        }        private void GetElementsStringList(string html, ref List<string> eleList)        {            HtmlInfo info = new HtmlInfo();            info.TagName = Regex.Match(html, @"(?<=/</s{0,5}|/<)([a-z,A-Z]+|h/d{1})(?=/>|/s)", RegexOptions.IgnoreCase).Value;            string currentTagBeginReg = @"</s{0,10}" + info.TagName + @".*?>";//获取当前标签元素开始标签正则            string currentTagEndReg = @"/<//" + info.TagName + @"/>";//获取当前标签元素收尾标签正则            if (string.IsNullOrEmpty(info.TagName)) return;            string eleHtml = "";            //情况1 <a/>            //情况2 <a></a>            //情况3 <a> 错误格式            //情况4endif            if (Regex.IsMatch(html, @"</s{0,10}" + info.TagName + "[^<].*?/>"))//单标签            {                eleHtml = Regex.Match(html, @"</s{0,10}" + info.TagName + "[^<].*?/>").Value;            }            else if (!Regex.IsMatch(html, currentTagEndReg))//没有收尾            {                if (Regex.IsMatch(html, @"/s{0,10}/</!/-/-/[if"))                {                    eleHtml = GetElementString(html, @"/s{0,10}/</!/-/-/[if", @"/[endif/]/-/-/>", 1);                }                else                {                    eleHtml = Regex.Match(html, currentTagBeginReg,RegexOptions.Singleline).Value;                }            }            else            {                eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);            }            try            {                eleList.Add(eleHtml);                html = html.Replace(eleHtml, "");                html = Regex.Replace(html, @"</!DOCTYPE.*?>", "");                if (!Regex.IsMatch(html, @"^/s*$"))                {                    GetElementsStringList(html, ref eleList);                }            }            catch (Exception ex)            {                throw new Exception("SORRY,您的HTML格式不能解析!!!");            }        }        private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)        {            string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);            var currentTagBeginMatches = Regex.Matches(newHtml, currentTagBeginReg, RegexOptions.Singleline).Cast<Match>().Select(c => c.Value).ToList();            var currentTagEndMatches = Regex.Matches(newHtml, currentTagEndReg).Cast<Match>().Select(c => c.Value).ToList();            if (currentTagBeginMatches.Count == currentTagEndMatches.Count)            { //两个签标元素相等                return newHtml;            }            return GetElementString(html, currentTagBeginReg, currentTagEndReg, ++i);        }        private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)        {            return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;        }        #endregion    }    public static class XHtmlElementExtendsion    {        /// <summa
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表