15
07/2014
C# 爬取网站的方法
方法1:用 .NET 自带的 WebClient 爬取数据,代码如下:
/*
 * Fetch a page with WebClient and print its body to the console.
 *
 * The client, the response stream and the reader are all wrapped in
 * using blocks so they are released even when the download throws.
 */
string s;
using (WebClient client = new WebClient())
{
    // Add a user agent header in case the
    // requested URI contains a query.
    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

    using (Stream data = client.OpenRead("http://www.baidu.com"))
    using (StreamReader reader = new StreamReader(data))
    {
        // Read the whole response body into memory as text.
        s = reader.ReadToEnd();
    }
}

Console.WriteLine(s);

// Keep the console window open until a key is pressed.
Console.ReadKey();
return;
方法2:用 HttpWebRequest 爬取数据,代码如下:
/*
 * Fetch a page with HttpWebRequest and decode it using the charset declared
 * either in the Content-Type response header or inside the document itself.
 *
 * The raw response bytes are buffered once and every decoding attempt starts
 * from those bytes, so trying a second encoding never loses information.
 */
string m_html = "";
int m_pagesize = 0;

HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create("http://www.baidu.com/");
myReq.Accept = @"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
myReq.Method = "GET";
myReq.Headers.Add(HttpRequestHeader.AcceptLanguage, @"zh-CN,zh;q=0.8");
myReq.UserAgent = @"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36";
myReq.KeepAlive = true;

byte[] rawBody;
string contentType;

using (HttpWebResponse rsps = (HttpWebResponse)myReq.GetResponse())
{
    contentType = rsps.ContentType ?? string.Empty;

    // Skip anything that is not text, or that announces more than 4 MiB (1 << 22 bytes).
    // The comparison is ordinal so it does not depend on the current culture.
    if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > 1 << 22)
    {
        return;
    }

    using (Stream sm = rsps.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        sm.CopyTo(buffer);
        rawBody = buffer.ToArray();
    }
}

// Decodes the buffered body; StreamReader also honours a byte-order mark if one is present.
Func<Encoding, string> decode = delegate(Encoding encoding)
{
    using (StreamReader bodyReader = new StreamReader(new MemoryStream(rawBody), encoding))
    {
        return bodyReader.ReadToEnd();
    }
};

Encoding cding = Encoding.Default;
int ix = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
if (ix != -1)
{
    // Isolate the charset value: drop any following parameters and surrounding quotes.
    string headerCharset = contentType.Substring(ix + "charset=".Length);
    int paramEnd = headerCharset.IndexOf(';');
    if (paramEnd != -1)
    {
        headerCharset = headerCharset.Substring(0, paramEnd);
    }
    headerCharset = headerCharset.Trim(' ', '\t', '"', '\'');

    try
    {
        cding = Encoding.GetEncoding(headerCharset);
    }
    catch (ArgumentException)
    {
        // Unknown charset name: fall back to the system default encoding.
        cding = Encoding.Default;
    }
    catch (NotSupportedException)
    {
        // Code page not available on this platform: fall back as well.
        cding = Encoding.Default;
    }

    Console.WriteLine("{0}", cding);
    m_html = decode(cding);
}
else
{
    // No charset in the header: decode provisionally with the default encoding
    // and look for a charset declaration inside the document text.
    m_html = decode(Encoding.Default);

    // Capture only the encoding name; stop at quotes, whitespace, '>' or ';'.
    Regex regex = new Regex(@"charset\s*=\s*[""']?(?<cding>[\w.:\-]+)", RegexOptions.IgnoreCase);
    Match declared = regex.Match(m_html);
    if (declared.Success)
    {
        try
        {
            cding = Encoding.GetEncoding(declared.Groups["cding"].Value);
        }
        catch (ArgumentException)
        {
            cding = Encoding.Default;
        }
        catch (NotSupportedException)
        {
            cding = Encoding.Default;
        }

        // Re-decode from the original bytes rather than from the provisional string.
        m_html = decode(cding);

        // Heuristic kept from the original: a page containing 100 or more '?'
        // characters is treated as badly decoded and the default encoding is used.
        if (m_html.Split('?').Length > 100)
        {
            m_html = decode(Encoding.Default);
        }
    }
}

m_pagesize = m_html.Length;
Console.WriteLine("{0}", m_html);

// Keep the console window open until a key is pressed.
Console.ReadKey();
0 条评论