`
380071587
  • 浏览: 446330 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

CollectionHelper-网页采集辅助类

 
阅读更多
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace Helper
{
    /// <summary>
    ///     网页采集辅助类
    /// </summary>
    public static class CollectionHelper
    {
        /// <summary>
        ///     取得字符里的Dom元素 不包含元素属性
        /// </summary>
        /// <param name="source"></param>
        /// <param name="domElem"></param>
        /// <returns></returns>
        public static List<string> GetDomElem(string source, string domElem)
        {
            var matchList = new List<string>();
            string regStr = string.Format("<{0}[^>]*?>[\\s\\S]+?<\\/{0}>", domElem);
            try
            {
                var regex = new Regex(regStr, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                MatchCollection matches = regex.Matches(source);
                foreach (Match match in matches)
                {
                    matchList.Add(match.Value);
                }
            }
            catch (Exception ex)
            {
                matchList.Add(ex.Message);
            }
            return matchList;
        }

        /// <summary>
        ///     取得字符里的Dom元素 包含元素属性 如:class="aa"
        /// </summary>
        /// <param name="source"></param>
        /// <param name="tagName"></param>
        /// <param name="tagValue"></param>
        /// <returns></returns>
        public static List<string> GetDomElemByAttr(string source, string tagName, string tagValue)
        {
            var matchList = new List<string>();
            string regStr =
                string.Format(
                    @"<(?<HtmlTag>[\w]+)[^>]*\s{0}[\s]*?=[\s]*?(?<Quote>[""']?){1}(?(Quote)\k<Quote>)[""']?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|[\s\S]*?)*</\k<HtmlTag>>",
                    tagName.ToLower(), tagValue);
            try
            {
                var regex = new Regex(regStr, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                MatchCollection matches = regex.Matches(source);
                foreach (Match match in matches)
                {
                    matchList.Add(match.Value);
                }
            }
            catch (Exception ex)
            {
                matchList.Add(ex.Message);
            }
            return matchList;
        }

        /// <summary>
        ///     取得字符里的A元素键值对  [name,url]
        /// </summary>
        /// <param name="source"></param>
        /// <returns></returns>
        public static Dictionary<string, string> GetDomElem_A(string source)
        {
            var matchList = new Dictionary<string, string>();
            const string pattern = "<a[^>]*? href=[\"'](?<url>[^\"']*?)[\"'][^>]*?>(?<text>[\\w\\W]*?)</a>";
            try
            {
                var regex = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                MatchCollection matches = regex.Matches(source);

                foreach (Match match in matches)
                {
                    string key = RemoveHtml(match.Value);
                    if (!matchList.ContainsKey(key))
                    {
                        matchList.Add(key, GetUrlArray(match.Value)[0]);
                    }
                }
            }
            catch (Exception ex)
            {
                matchList.Add(ex.Message, "");
            }
            return matchList;
        }


        /// <summary>
        ///     获取页面内容后,用匹配url正则表达式抓取内容中的url
        /// </summary>
        /// <param name="code">列表代码</param>
        /// <returns>返回截取后的URL地址</returns>
        public static List<string> GetUrlArray(string code)
        {
            var urlList = new List<string>();
            var regex =
                new Regex(@"(http://)?[\w-\.]*([\/]?[\w-])+[\w-]*\.(htm|html|shtm|shtml|aspx|asp|php|jsp)+[\w-\=\?]*",
                          RegexOptions.Compiled | RegexOptions.IgnoreCase);
            MatchCollection matches = regex.Matches(code);
            foreach (Match match in matches)
            {
                urlList.Add(match.Value);
            }
            return urlList;
        }

        /// <summary>
        ///     获取内容code中所有都图片地址
        /// </summary>
        /// <returns>返回截取后都图片地址</returns>
        public static Dictionary<string, string> GetImgUrlArray(string content)
        {
            var imgList = new Dictionary<string, string>();
            var reg = new Regex(@"<img[\s\S]*?src=(""(?<src>[^']*?)""|'(?<src>[^']*?)'|(?<src>[^>\s]*))[^>]*?>(.*?)");
            MatchCollection m = reg.Matches(content.ToLower());
            foreach (Match match in m)
            {
                string matchValue = match.Groups["src"].Value;
                if (!imgList.ContainsKey(matchValue))
                {
                    imgList.Add(matchValue, matchValue);
                }
            }
            return imgList;
        }

        /// <summary>
        ///     将相对地址转换为绝对地址
        /// </summary>
        /// <param name="relativeAddress">要转换的相对地址</param>
        /// <param name="absoluteAddress">当前网页地址</param>
        /// <returns>返回转换后的地址</returns>
        public static string ConvertToAbsluteUrl(string relativeAddress, string absoluteAddress)
        {
            if (string.IsNullOrEmpty(relativeAddress))
            {
                return string.Empty;
            }
            if (relativeAddress.Contains("://"))
            {
                return relativeAddress;
            }
            if (string.IsNullOrEmpty(absoluteAddress))
            {
                return string.Empty;
            }
            if (!absoluteAddress.Contains("://"))
            {
                return string.Empty;
            }
            var baseUrl = new Uri(absoluteAddress);
            var webrul = new Uri(baseUrl, relativeAddress);
            return webrul.ToString();
        }

        /// <summary>
        ///     替换所有HTML标签为空
        /// </summary>
        /// <param name="input">The string whose values should be replaced.</param>
        /// <returns>A string.</returns>
        public static string RemoveHtml(string input)
        {
            var stripTags = new Regex("</?[a-z][^<>]*>", RegexOptions.IgnoreCase);
            return stripTags.Replace(input, string.Empty);
        }

        /// <summary>
        ///     移除字符串中的空格及换行符
        /// </summary>
        /// <param name="input">The string whose values should be replaced.</param>
        /// <returns>A string.</returns>
        public static string RemoveBlank(string input)
        {
            input = input.Replace("\r", string.Empty);
            input = input.Replace("\n", string.Empty);
            input = input.Replace(" ", string.Empty);
            return input;
        }

        // 获取网页的HTML内容,根据网页的charset自动判断Encoding  
        public static string GetHtml(string url)
        {
            return GetHtml(url, null);
        }

        // 获取网页的HTML内容,指定Encoding  
        private static string GetHtml(string url, Encoding encoding)
        {
            string getSource;
            try
            {
                byte[] buf = new WebClient().DownloadData(url);
                if (encoding != null) return encoding.GetString(buf);
                string html = Encoding.UTF8.GetString(buf);
                encoding = GetEncoding(html);
                if (encoding == null || (Equals(encoding, Encoding.UTF8))) return html;
                getSource = encoding.GetString(buf);
            }
            catch (NotSupportedException exception)
            {
                getSource = exception.Message;
            }
            catch (InvalidOperationException exception)
            {
                getSource = exception.Message;
            }
            catch (IOException exception)
            {
                getSource = exception.Message;
            }
            return getSource;
        }

        /// <summary>
        ///     根据网页的HTML内容提取网页的Encoding
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        private static Encoding GetEncoding(string html)
        {
            const string pattern = @"(?i)\bcharset=(?<charset>[-a-zA-Z_0-9]+)";
            string charset = Regex.Match(html, pattern).Groups["charset"].Value;
            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }
    }
}

分享到:
评论

相关推荐

    WPF应用程序框架(WAF)v2.5.0.7源码2012825

    PropertyChanged:提供了一个辅助方法来测试如果一个属性改变事件是当一个特定的行动提出被执行。 CanExecuteChangedEvent:一个helper方法来测试一个CanExecute改变事件是当一个特定的行动提出被执行。 v2507更新...

    Json、Webservice、Jquery、Ajax

    版本号:v1.0 创建人:王国胜 版本创建日期:2011-03-30... CollectionHelper.cs List与DataTable相互转换类 JsonAndDateTable.cs Json与DataTable相互转换类 Default.aspx Json、Webservice、Jquery、Ajax测试页

    JAVA 工具类 toolkit

    对常用的处理封装 JsonResult、PageBean、exception、excel、FtpHelper、HttpHelper、AESHelper、DESHelper、RSAHelper、ChineseUtil、ClassUtil、CollectionHelper、ConvertUtil、DateUtil、FileUtil、ImageUtil、...

    Tup.Utilities:一组 C# 工具助手类(C # tools a set of helper classes)

    CollectionHelper 集合处理 工具类 DateTimeHelper 时间操作 工具类 BatchHelper 批量执行动作 工具类 FieldHelper DataTable/IDataReader 数据字段 工具类 RetryHelper 重试 工具类 JsonHelper Newtonsoft.Json ...

    SparkGraphx计算指定节点的N度关系节点

    import horizon.graphx.util.CollectionUtil.CollectionHelper import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable...

Global site tag (gtag.js) - Google Analytics