- WebClient
将网页(webpage)下载到本地文件或字符串(string)中。
1 2 3 4 5 6 7 8 9 10 11 12 13
// Download a page and print every href="..." attribute found in it.
byte[] page;

// WebClient is IDisposable; release it as soon as the download has finished.
using (System.Net.WebClient client = new System.Net.WebClient())
{
    page = client.DownloadData("http://www.google.com");
}

// The raw bytes are always decoded as UTF-8 here, whatever charset the server declares.
string content = System.Text.Encoding.UTF8.GetString(page);

// Pattern: href= + opening quote, an optional "http://", "./" or "/" prefix,
// a dotted word sequence, optional "/segment" parts, then an optional trailing
// slash or "?key=value&key=value" query, and the closing quote.
string regex = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
Regex re = new Regex(regex);
MatchCollection matches = re.Matches(content);

// MatchCollection is enumerable, so foreach replaces the manual IEnumerator loop.
foreach (Match match in matches)
{
    Console.Write(match.Value + "\r\n");
}
|
上面是使用正则表达式解析 href 属性的案例。
- Winista.Htmlparser.Net
获取HTML 树结构
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
// Step 1: fetch the HTML of the URL selected in CBUrl and show it in txtHtmlWhole.
// (The original snippet carried an unmatched #endregion and an unclosed #region,
// which cannot compile on their own; plain comments mark the two steps instead.)
try
{
    txtHtmlWhole.Text = "";
    string url = CBUrl.SelectedItem.ToString().Trim();

    // WebClient is IDisposable; release it as soon as the download has finished.
    using (System.Net.WebClient aWebClient = new System.Net.WebClient())
    {
        // The response is decoded with the system default encoding.
        // NOTE(review): pages served in a different charset may decode incorrectly - confirm.
        aWebClient.Encoding = System.Text.Encoding.Default;
        string html = aWebClient.DownloadString(url);
        txtHtmlWhole.Text = html;
    }
}
catch (Exception ex)
{
    // Any failure (no selection, bad URL, network error) is reported to the user.
    MessageBox.Show(ex.Message);
}

// Step 2: parse the HTML nodes of whatever text is currently in txtHtmlWhole.
Lexer lexer = new Lexer(this.txtHtmlWhole.Text);
Parser parser = new Parser(lexer);
// Parse is called with a null argument; presumably this means "no node filter" -
// TODO confirm against the Winista.HtmlParser documentation.
NodeList htmlNodes = parser.Parse(null);
|
- HtmlAgilityPack
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Create the HtmlAgilityPack web client and load the document from the site.
var web = new HtmlWeb();
var page = web.Load("http://www.cnblogs.com/");

// Look up every <a> element whose class attribute is 'titlelnk'.
var titleLinks = page.DocumentNode.SelectNodes("//a[@class='titlelnk']");

// The lookup result is null-checked before use, so treat null as "nothing to print".
int linkCount = titleLinks == null ? 0 : titleLinks.Count;
for (int i = 0; i < linkCount; i++)
{
    Console.WriteLine(titleLinks[i].InnerText);
}

// Read one character from standard input before continuing.
Console.Read();
|
- SgmlReader
用这个工具先将 HTML 文件转成标准的 XML 格式文件,再通过指定 XPath 路径来提取所需要的内容(XPath 路径可以通过上面的那个工具生成)。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
// Convert the markup held in `html` (declared outside this snippet) to XML with
// SgmlReader, then use XPath to print the cells of the table inside div.infoList.
XPathDocument pathDoc = null;
using (SgmlReader sgmlReader = new SgmlReader())
{
    sgmlReader.DocType = "HTML";
    sgmlReader.InputStream = new StringReader(html);
    using (StringWriter stringWriter = new StringWriter())
    using (XmlTextWriter xmlWriter = new XmlTextWriter(stringWriter))
    {
        // Copy every node the SGML reader produces into the XML writer.
        while (!sgmlReader.EOF)
        {
            xmlWriter.WriteNode(sgmlReader, true);
        }

        // Strip the XHTML default-namespace declaration so the un-prefixed
        // XPath expressions below can match the elements.
        string xml = stringWriter.ToString().Replace("xmlns=\"http://www.w3.org/1999/xhtml\"", "");
        pathDoc = new XPathDocument(new StringReader(xml));
    }
}

// Select the whole table.
string xpath = "//div[@class=\"infoList\"]/table"; // XPath expression
XPathNavigator nav = pathDoc.CreateNavigator();
XPathNodeIterator nodes = nav.Select(xpath);
if (!nodes.MoveNext())
{
    return;
}

// ".//tr" is evaluated relative to the matched table. The original "//tr" is an
// absolute path and would select rows from the entire document.
nodes = nodes.Current.Select(".//tr");
if (!nodes.MoveNext())
{
    return;
}

// The MoveNext() above already positioned the iterator on the first row, so a
// do/while is used; a plain while (MoveNext()) here would skip that first row.
System.Text.StringBuilder rows = new System.Text.StringBuilder();
do
{
    // Walk every column of the current row.
    XPathNodeIterator tdNode = nodes.Current.Select("./td");
    while (tdNode.MoveNext())
    {
        rows.Append(tdNode.Current.Value.Trim()).Append(" ");
    }
    rows.Append("\r\n");
} while (nodes.MoveNext());

// Print the result.
string str = rows.ToString();
Console.WriteLine(str);
|