当前位置: 代码迷 >> ASP.NET >> 一个简单的网页数据采集,有什么好方法
  详细解决方案

一个简单的网页数据采集,有什么好方法

热度:8184   发布时间:2013-02-26 00:00:00.0
一个简单的网页数据采集,请教大家有什么好方法
客户的需求是,在和他们同类型网站中采集实时的商品及价格。然后与自己的数据进行比对

打个比方说,客户和它的同行的网站中,几乎都有下面这样的表格显示

类别               价格

商品1             2.63元
商品2             3.41元
商品3             1.28元

在这里,商品的种类差不多都是一样的,都是那么几种,但各网站可能有不同的表现方法,
有的是用
<table>
<tr> <th> 类别 </th> <th> 价格 </th>
<tr> <td> 商品1 </td> <td> 2.63元 </td>
</table>
有的是用div的无序列表来显示。

而且出现的文字形式也有可能不同,例如:

类别               价格

商品一           $2.63
商品二           $3.41
商品三           $1.28

----------------------------------------------------------

我想先实现一个DEMO,大家看看怎么实现

假如有一个TextBox为txtUrl,输入客户同行的网站URL
   
然后将采集回来的数据经过过滤,返回“类别”和“价格”两种数据,然后写入到数据库

大家给提供个思路,以及关键的类和方法。有示例代码更好。

------解决方案--------------------------------------------------------
// A throwaway learning sample written yesterday -- use it as a reference.
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
using System.IO;

namespace GetQ
{
    /// <summary>
    /// Demo form that repeatedly downloads a quiz page, extracts the questions,
    /// their options and the answer found in the page's script, and accumulates
    /// them in <c>data.xml</c> next to the executable.
    /// </summary>
    public class Form1 : System.Windows.Forms.Form
    {
        /// <summary>Root of the scraped site; image paths are resolved against it.</summary>
        private const string SiteRoot = "http://www.gdgajj.com";

        /// <summary>Page that is downloaded on every poll.</summary>
        private const string ExamPageUrl = SiteRoot + "/wzks/index.jsp";

        /// <summary>Pause between two polls so the remote site is not hammered in a tight loop.</summary>
        private const int PollDelayMilliseconds = 1000;

        // Patterns are compiled once instead of on every poll.
        private static readonly Regex ScriptRegex =
            new Regex(@"<script language=""JavaScript"">([\s\S]*?)</script>");

        private static readonly Regex QuestionTableRegex =
            new Regex(@"<table width=""100%"" border=""0"" cellpadding=""2"" cellspacing=""1"" bgcolor=""#999999"">([\s\S]*?)</table>");

        /// <summary>Signature used to marshal a log line onto the UI thread.</summary>
        private delegate void LogAppender(string str);

        private System.Windows.Forms.Button button1;
        private System.Windows.Forms.RichTextBox txtLog;

        /// <summary>
        /// Required designer variable.
        /// </summary>
        private System.ComponentModel.Container components = null;

        /// <summary>Client used for the page download; created in <see cref="Form1_Load"/>.</summary>
        private WebClient c = null;

        /// <summary>The single polling thread, or null until the button is first clicked.</summary>
        private System.Threading.Thread worker = null;

        public Form1()
        {
            // Required for Windows Forms designer support.
            InitializeComponent();
        }

        /// <summary>
        /// Releases the resources held by the form.
        /// </summary>
        /// <param name="disposing">True when called from Dispose(), false from a finalizer.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                if (components != null)
                {
                    components.Dispose();
                }

                // The WebClient is owned by the form, so release it with the form.
                if (this.c != null)
                {
                    this.c.Dispose();
                    this.c = null;
                }
            }

            base.Dispose(disposing);
        }

        #region Windows Form Designer generated code

        /// <summary>
        /// Required method for designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            System.Resources.ResourceManager resources = new System.Resources.ResourceManager(typeof(Form1));
            this.button1 = new System.Windows.Forms.Button();
            this.txtLog = new System.Windows.Forms.RichTextBox();
            this.SuspendLayout();
            //
            // button1
            //
            this.button1.Location = new System.Drawing.Point(424, 464);
            this.button1.Name = "button1";
            this.button1.TabIndex = 0;
            this.button1.Text = "开始";
            this.button1.Click += new System.EventHandler(this.button1_Click);
            //
            // txtLog
            //
            this.txtLog.Location = new System.Drawing.Point(32, 48);
            this.txtLog.Name = "txtLog";
            this.txtLog.Size = new System.Drawing.Size(608, 360);
            this.txtLog.TabIndex = 1;
            this.txtLog.Text = "";
            //
            // Form1
            //
            this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
            this.ClientSize = new System.Drawing.Size(728, 525);
            this.Controls.Add(this.txtLog);
            this.Controls.Add(this.button1);
            this.Icon = ((System.Drawing.Icon)(resources.GetObject("$this.Icon")));
            this.Name = "Form1";
            this.Text = "数据抓取";
            this.Load += new System.EventHandler(this.Form1_Load);
            this.ResumeLayout(false);
        }

        #endregion

        /// <summary>
        /// Downloads the quiz page once, reduces the question table to XML and
        /// hands the resulting rows to <see cref="SaveXml"/>.
        /// </summary>
        /// <remarks>
        /// Download and XML parsing errors are not handled here; they propagate to the caller.
        /// </remarks>
        void DoGet()
        {
            byte[] b = this.c.DownloadData(ExamPageUrl);

            // NOTE(review): decodes with the machine's ANSI code page, so the result
            // depends on the Windows locale -- confirm the page's real encoding.
            string strOrgHTML = System.Text.Encoding.Default.GetString(b);

            // The second <script> block on the page is the one kept for answer lookup.
            MatchCollection ms = ScriptRegex.Matches(strOrgHTML);
            if (ms.Count < 2)
            {
                MessageBox.Show("没有解析,退出!");
                return;
            }
            string sScript = ms[1].Groups[1].Value;

            // Body of the table that holds the questions.
            ms = QuestionTableRegex.Matches(strOrgHTML);
            if (ms.Count == 0)
            {
                MessageBox.Show("没有解析,退出!");
                return;
            }
            string strHTML = ms[0].Groups[1].Value;

            // Strip attributes from every tag except <img>/<input>, then drop div/font wrappers.
            strHTML = Regex.Replace(strHTML, @"<(?!img|input)([^>\s]*)[^>]*?>", "<$1>", RegexOptions.IgnoreCase);
            strHTML = Regex.Replace(strHTML, "</?div[^>]*?>", "");
            strHTML = Regex.Replace(strHTML, "</?font[^>]*?>", "");

            // Encode each radio button as "^name@value@" so the option text that
            // follows it can later be split back out with '^' and '@'.
            strHTML = Regex.Replace(strHTML, @"<input type=""radio"" name=""([^""]*?)""\s*value=""([A-E])"">", "^$1@$2@");

            // Turn <img src="x"> into <img>x</img> so the URL survives as element text.
            strHTML = Regex.Replace(strHTML, @"<img\s*src=""([^""]*?)"">", "<img>$1</img>");
            strHTML = "<table>" + strHTML + "</table>";

            XmlDocument dom = new XmlDocument();
            dom.LoadXml(strHTML);

            XmlNodeList nl = dom.SelectNodes("//tr");
            this.SaveXml(nl, sScript);
        }

        /// <summary>
        /// Merges the scraped rows into <c>data.xml</c>. Rows are consumed in pairs:
        /// the first row of a pair carries the question text, the second its options
        /// (first cell) and an optional image path (second cell).
        /// </summary>
        /// <param name="nl">The <c>tr</c> nodes extracted from the question table.</param>
        /// <param name="sScript">Page script that is searched for each question's answer.</param>
        void SaveXml(XmlNodeList nl, string sScript)
        {
            string sPath = Path.Combine(Application.StartupPath, "data.xml");
            XmlDocument dom = new XmlDocument();
            if (!File.Exists(sPath))
            {
                dom.AppendChild(dom.CreateElement("root"));
            }
            else
            {
                try
                {
                    dom.Load(sPath);
                }
                catch (Exception er)
                {
                    MessageBox.Show("读取xml出错!,请删除后重新生成!" + er.Message, "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    return;
                }
            }

            // DocumentElement rather than ChildNodes[0]: the latter would be the
            // XML declaration if the file happens to contain one.
            XmlElement root = dom.DocumentElement;

            int iNewCount = 0;

            // "i + 1 < Count" ignores a trailing unpaired row instead of dereferencing null.
            for (int i = 0; i + 1 < nl.Count; i += 2)
            {
                XmlNode questionRow = nl[i];
                XmlNode optionRow = nl[i + 1];

                string sName = ExtractQuestionText(questionRow.InnerText);
                if (sName.Length == 0 || HasQuestion(dom, sName))
                {
                    continue;
                }

                XmlNode optionCell = optionRow.ChildNodes[0];
                if (optionCell == null)
                {
                    continue;
                }

                XmlElement ask = dom.CreateElement("ask");
                ask.SetAttribute("name", sName);
                root.AppendChild(ask);
                iNewCount++;

                // Each non-blank piece looks like "radioName@value@option text".
                string radioName = "";
                string[] ary = optionCell.InnerText.Split('^');
                foreach (string piece in ary)
                {
                    if (piece.Trim() == "")
                    {
                        continue;
                    }

                    string[] aryOptions = piece.Split('@');
                    radioName = aryOptions[0];
                    if (aryOptions.Length == 3)
                    {
                        XmlElement nodeAnswer = dom.CreateElement("ans");
                        nodeAnswer.InnerText = aryOptions[2];
                        ask.AppendChild(nodeAnswer);
                        nodeAnswer.SetAttribute("id", aryOptions[0]);
                        nodeAnswer.SetAttribute("aid", aryOptions[1]);
                    }
                }

                // Look the answer up in the script. All radios of one question share
                // the same name, so the last one seen is as good as any. The script
                // contains text of the form: frmexam.radio37[i].value == "A"
                Regex reg = new Regex(@"frmexam\." + Regex.Escape(radioName.Trim()) + @"\[i\]\.value\s*==\s*""([A-E])""");
                Match m = reg.Match(sScript);
                if (m.Success)
                {
                    ask.SetAttribute("answer", m.Groups[1].Value);
                }
                else
                {
                    this.WriteLog("没有找到正确答案!");
                }

                // Optional image: download it under a fresh GUID name and reference it.
                XmlNode imageCell = optionRow.ChildNodes[1];
                string sImageURL = imageCell == null ? "" : imageCell.InnerText.Trim();
                if (sImageURL != "")
                {
                    string imgId = Guid.NewGuid().ToString();
                    try
                    {
                        using (WebClient imageClient = new WebClient())
                        {
                            // Saved beside data.xml so the relative "src" value resolves.
                            imageClient.DownloadFile(SiteRoot + sImageURL, Path.Combine(Application.StartupPath, imgId));
                        }
                        ask.SetAttribute("src", imgId);
                    }
                    catch (WebException ex)
                    {
                        // A missing picture should not discard the rest of the batch.
                        this.WriteLog("下载图片失败: " + ex.Message);
                    }
                }
            }

            dom.Save(sPath);
            this.WriteLog(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss")
                + "\t本次搜索到" + (nl.Count / 2).ToString()
                + "条,保存" + iNewCount.ToString()
                + "条,总计" + root.ChildNodes.Count.ToString() + "条");
        }

        /// <summary>
        /// Removes the two leading characters of a question row (presumably the
        /// question number -- confirm against the page) and an optional "、" separator.
        /// </summary>
        /// <param name="rawText">Inner text of the question row.</param>
        /// <returns>The trimmed question text, or an empty string when nothing is left.</returns>
        private static string ExtractQuestionText(string rawText)
        {
            if (rawText == null || rawText.Length <= 2)
            {
                return "";
            }

            string text = rawText.Substring(2).Trim();
            if (text.Length > 0 && text[0] == '、')
            {
                text = text.Substring(1).Trim();
            }
            return text;
        }

        /// <summary>
        /// Tells whether the document already contains an <c>ask</c> element with the given name.
        /// Compares attribute values directly so quotes in the text cannot break an XPath literal.
        /// </summary>
        private static bool HasQuestion(XmlDocument dom, string name)
        {
            foreach (XmlNode node in dom.GetElementsByTagName("ask"))
            {
                XmlElement ask = node as XmlElement;
                if (ask != null && ask.GetAttribute("name") == name)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Appends a line to <c>log.txt</c> and to the on-screen log.
        /// May be called from the worker thread; the UI update is marshalled to the UI thread.
        /// </summary>
        /// <param name="str">Text to log, without a line terminator.</param>
        void WriteLog(string str)
        {
            string sFileName = Path.Combine(Application.StartupPath, "log.txt");
            using (StreamWriter sw = new StreamWriter(sFileName, true, System.Text.Encoding.Default))
            {
                sw.Write(str + "\r\n");
            }

            if (this.IsDisposed || this.txtLog.IsDisposed)
            {
                return;
            }

            if (this.txtLog.InvokeRequired)
            {
                try
                {
                    this.txtLog.BeginInvoke(new LogAppender(this.AppendLogText), new object[] { str });
                }
                catch (InvalidOperationException)
                {
                    // The form is closing and its handle is gone; the line is
                    // already in log.txt, so there is nothing left to display.
                }
            }
            else
            {
                this.AppendLogText(str);
            }
        }

        /// <summary>Adds a line to the log box and scrolls to it. Must run on the UI thread.</summary>
        private void AppendLogText(string str)
        {
            this.txtLog.Text += str + "\r\n";
            this.txtLog.SelectAll();
            this.txtLog.ScrollToCaret();
        }

        /// <summary>
        /// Starts the polling thread. Further clicks are ignored while it is running,
        /// because the shared WebClient and data.xml cannot be used concurrently.
        /// </summary>
        private void button1_Click(object sender, System.EventArgs e)
        {
            this.txtLog.Focus();

            if (this.worker != null && this.worker.IsAlive)
            {
                return;
            }

            this.worker = new System.Threading.Thread(new System.Threading.ThreadStart(this._Search));
            // Background thread: it must not keep the process alive after the form closes.
            this.worker.IsBackground = true;
            this.worker.Start();
        }

        /// <summary>
        /// Worker loop: polls the page forever, logging failures instead of letting
        /// an unhandled exception terminate the process.
        /// </summary>
        void _Search()
        {
            while (true)
            {
                try
                {
                    this.DoGet();
                }
                catch (Exception ex)
                {
                    this.WriteLog("抓取出错: " + ex.Message);
                }

                System.Threading.Thread.Sleep(PollDelayMilliseconds);
            }
        }

        /// <summary>Creates the WebClient used by <see cref="DoGet"/>.</summary>
        private void Form1_Load(object sender, System.EventArgs e)
        {
            this.c = new WebClient();
        }
    }
}
  相关解决方案