C#抓取网页内容常用类
public class HtmlUtil
{
#region 获取指定ID的标签内容
/// <summary>
/// 获取指定ID的标签内容
/// </summary>
/// <param name="html">HTML源码</param>
/// <param name="id">标签ID</param>
/// <returns></returns>
public static string GetElementById(string html, string id)
{
string pattern = @"<([a-z]+)(?:(?!id)[^<>])*id=([""']?){0}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
pattern = string.Format(pattern, Regex.Escape(id));
Match match = Regex.Match(html, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
return match.Success ? match.Value : "";
}
#endregion
#region 通过class属性获取对应标签集合
/// <summary>
/// 通过class属性获取对应标签集合
/// </summary>
/// <param name="html">HTML源码</param>
/// <param name="className">class值</param>
/// <returns></returns>
public static string[] GetElementsByClass(string html, string className)
{
return GetElements(html, "", className);
}
#endregion
#region 通过标签名获取标签集合
/// <summary>
/// 通过标签名获取标签集合
/// </summary>
/// <param name="html">HTML源码</param>
/// <param name="tagName">标签名(如div)</param>
/// <returns></returns>
public static string[] GetElementsByTagName(string html, string tagName)
{
return GetElements(html, tagName, "");
}
#endregion
#region 通过同时指定标签名+class值获取标签集合
/// <summary>
/// 通过同时指定标签名+class值获取标签集合
/// </summary>
/// <param name="html">HTML源码</param>
/// <param name="tagName">标签名</param>
/// <param name="className">class值</param>
/// <returns></returns>
public static string[] GetElementsByTagAndClass(string html, string tagName, string className)
{
return GetElements(html, tagName, className);
}
#endregion
#region 通过同时指定标签名+class值获取标签集合(内部方法)
/// <summary>
/// 通过同时指定标签名+class值获取标签集合(内部方法)
/// </summary>
/// <param name="html"></param>
/// <param name="tagName"></param>
/// <param name="className"></param>
/// <returns></returns>
private static string[] GetElements(string html, string tagName, string className)
{
string pattern = "";
if (tagName != "" && className != "")
{
pattern = @"<({0})(?:(?!class)[^<>])*class=([""']?){1}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
pattern = string.Format(pattern, Regex.Escape(tagName), Regex.Escape(className));
}
else if (tagName != "")
{
pattern = @"<({0})(?:[^<>])*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
pattern = string.Format(pattern, Regex.Escape(tagName));
}
else if (className != "")
{
pattern = @"<([a-z]+)(?:(?!class)[^<>])*class=([""']?){0}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
pattern = string.Format(pattern, Regex.Escape(className));
}
if (pattern == "")
{
return new string[] { };
}
List<string> list = new List<string>();
Regex reg = new Regex(pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
Match match = reg.Match(html);
while (match.Success)
{
list.Add(match.Value);
match = reg.Match(html, match.Index + match.Length);
}
return list.ToArray();
}
/// <summary>
/// 根据正则获取内容
/// </summary>
/// <param name="text">内容</param>
/// <param name="pat">正则</param>
public static string[] GetListByHtml(string text, string pat)
{
List<string> list = new List<string>();
Regex r = new Regex(pat, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Match m = r.Match(text);
//int matchCount = 0;
while (m.Success)
{
list.Add(m.Value);
m = m.NextMatch();
}
return list.ToArray();
}
/// <summary>
/// 去除html标签
/// </summary>
/// <param name="Htmlstring"></param>
/// <returns></returns>
public static string NoHTML(string Htmlstring)
{
//删除脚本
Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
//删除HTML
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&",RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2",RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3",RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
Htmlstring.Replace("<", "");
Htmlstring.Replace(">", "");
Htmlstring.Replace("\r\n", "");
Htmlstring = Htmlstring.Trim();
return Htmlstring;
}
/// <summary>
/// 取得HTML中所有图片的 URL。
/// </summary>
/// <param name="sHtmlText">HTML代码</param>
/// <returns>图片的URL列表</returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
// 定义正则表达式用来匹配 img 标签
Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
// 搜索匹配的字符串
MatchCollection matches = regImg.Matches(sHtmlText);
int i = 0;
string[] sUrlList = new string[matches.Count];
// 取得匹配项列表
foreach (Match match in matches)
sUrlList[i++] = match.Groups["imgUrl"].Value;
return sUrlList;
}
/// <summary>
/// 获取img的alt标签
/// </summary>
/// <param name="strHtml"></param>
/// <returns></returns>
public static string[] GetHtmlAltList(string strHtml)
{
// 定义正则表达式用来匹配 img 标签
Regex regImg = new Regex(@"<img\b[^<>]*?\balt[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgAlt>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
// 搜索匹配的字符串
MatchCollection matches = regImg.Matches(strHtml);
int i = 0;
string[] sUrlList = new string[matches.Count];
// 取得匹配项列表
foreach (Match match in matches)
sUrlList[i++] = match.Groups["imgAlt"].Value;
return sUrlList;
}
#endregion
}