当前位置: 代码迷 >> C# >> 用c#编写爬虫在marinetraffic下载船只图片
  详细解决方案

用c#编写爬虫在marinetraffic下载船只图片

热度:27   发布时间:2016-05-05 03:06:12.0
用c#编写爬虫在marinetraffic下载船只图片

最近在做船只识别方面的事情,需要大量的正样本来训练adaboost分类器。于是到marinetraffic这个网站上下载船只图片。写个爬虫来自动下载显然很方便。

网站特点

在介绍爬虫之前首先了解一下marinetraffic这个网站的一些特点:
1. 会定期检测爬虫行为,如果认为有爬虫大量下载图片。会把该连接加入黑名单,后几天都没办法下载。
2. 船只图片资源差异大,有的船只有1000多张图,有的船只没有一张图,我们需要的是很多船只的很多张图,所以需要对下载的船只按优先级排序。
3. 用来训练分类器的正样本要求检测对象的分辨率一致,而在marinetraffic网站下载图片时只能设置图片的宽度,网站会根据长宽比生成相应的高度。所以,不同图片的高度不一样,需要自己后期处理。

船只图片

解决方案

  1. 针对爬虫检测,设置一个随机等待时间,10s左右。可以绕过网站爬虫行为检测。
  2. 对船只按照图片数量排序,先下载图片数量多的,并且每个船只不用下载太多,保证图片的差异性。例如下面的源码对每艘船只请求第一页(per_page:100/page:1),即每艘船最多下载100张。
  3. 在下载的时候使用统一的宽度。后期处理从图片中抠出分辨率一样的船只

爬虫源码

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace 船只图像爬虫
{
    /// <summary>
    /// Console crawler that collects ship ids from the marinetraffic.com ship index
    /// (sorted by photo count, descending) and then downloads the photos listed on
    /// the first photo page of every collected ship.
    /// </summary>
    class Program
    {
        /// <summary>Directory that receives the ship-id list and every downloaded image.</summary>
        const string OutputDirectory = @"C:\Users\dragonfive\Desktop\爬虫获得船只图片\第三批";

        /// <summary>Name of the file, inside <see cref="OutputDirectory"/>, that stores the collected ship ids.</summary>
        const string ShipIdListFileName = "0_100page_shipid.txt";

        /// <summary>
        /// Single generator shared by all throttling delays; the program is
        /// single-threaded, so one instance is enough.
        /// </summary>
        static readonly Random DelayRandom = new Random();

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the raw response bytes.
        /// </summary>
        /// <param name="client">Client used to issue the request.</param>
        /// <param name="url">Address to download.</param>
        /// <returns>The response body.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        static byte[] fetch(WebClient client, string url)
        {
            // Credentials and User-Agent are re-applied before every request, exactly as
            // the original code did, so each request is guaranteed to carry them.
            client.Credentials = CredentialCache.DefaultCredentials;
            client.Headers["User-Agent"] = "blah";
            return client.DownloadData(url);
        }

        /// <summary>
        /// Prints <paramref name="message"/>, sleeps for a random 10-19 seconds so the
        /// site's crawler detection is not triggered, then terminates the console line.
        /// </summary>
        /// <param name="message">Progress text written before the pause.</param>
        static void polite_delay(string message)
        {
            Console.Write(message);
            Thread.Sleep(DelayRandom.Next(10, 20) * 1000);
            Console.WriteLine();
        }

        /// <summary>
        /// Returns, in document order, every substring of <paramref name="text"/> that
        /// sits between an occurrence of <paramref name="label"/> and the next
        /// occurrence of <paramref name="terminator"/>.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="label">Marker that immediately precedes a value.</param>
        /// <param name="terminator">Marker that immediately follows a value.</param>
        /// <returns>The extracted values; empty when the label never occurs.</returns>
        static List<string> extract_values(string text, string label, string terminator)
        {
            var values = new List<string>();
            int labelIndex = text.IndexOf(label, StringComparison.Ordinal);
            while (labelIndex != -1)
            {
                int valueStart = labelIndex + label.Length;
                int valueEnd = text.IndexOf(terminator, valueStart, StringComparison.Ordinal);
                if (valueEnd == -1)
                {
                    // A label without a terminator (e.g. a truncated page) has no
                    // extractable value; stop instead of passing a negative length
                    // to Substring.
                    break;
                }

                values.Add(text.Substring(valueStart, valueEnd - valueStart));
                labelIndex = text.IndexOf(label, valueStart, StringComparison.Ordinal);
            }

            return values;
        }

        /// <summary>
        /// Walks index pages 1 to 99 of the ship list (50 ships per page, sorted by
        /// photo count descending), appends every ship id not yet present to
        /// <paramref name="shipid_list"/>, and finally writes the list to
        /// <see cref="ShipIdListFileName"/> as plain text, one id per line.
        /// A page that fails to download is reported on the console and skipped.
        /// </summary>
        /// <param name="shipid_list">List that receives the newly discovered ship ids.</param>
        static void download_all_shipid(List<string> shipid_list)
        {
            // Mirror of the list used only for constant-time duplicate checks.
            var knownIds = new HashSet<string>(shipid_list);

            using (var client = new WebClient())
            {
                for (int pageNum = 1; pageNum < 100; pageNum++)
                {
                    Console.WriteLine("开始分析第" + pageNum + "张网页");
                    try
                    {
                        byte[] pageData = fetch(client, @"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50");
                        string pageHtml = Encoding.UTF8.GetString(pageData);

                        foreach (string shipid in extract_values(pageHtml, "shipid:", "/"))
                        {
                            // An empty id cannot form a valid photo-page URL, so it is ignored.
                            if (shipid.Length > 0 && knownIds.Add(shipid))
                            {
                                Console.WriteLine("新增id:" + shipid);
                                shipid_list.Add(shipid);
                            }
                        }

                        Console.WriteLine("完成第" + pageNum + "页分析");
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine(webEx.Message.ToString());
                    }

                    // Pause between pages to stay below the site's crawler detection.
                    polite_delay("绕开网站爬虫行为检测中......");
                }
            }

            Console.WriteLine("分析结束");

            // Plain text, one id per line: readable, and free of BinaryFormatter, which is
            // obsolete and unsafe. WriteAllLines replaces any previous file completely.
            Directory.CreateDirectory(OutputDirectory);
            File.WriteAllLines(Path.Combine(OutputDirectory, ShipIdListFileName), shipid_list);
        }

        /// <summary>
        /// Downloads every image referenced through a <c>data-original='...'</c> attribute
        /// on the first photo page (up to 100 entries) of the given ship and stores each
        /// one as <c>{ship_id}_{index}.jpg</c> in <see cref="OutputDirectory"/>.
        /// A failed image download is reported and skipped; a failure to load the photo
        /// page itself is reported and ends the method.
        /// </summary>
        /// <param name="ship_id">Ship id as it appears in marinetraffic URLs.</param>
        static void download_jpg(string ship_id)
        {
            Console.WriteLine("开始下载shipid为:" + ship_id + "的图片");

            using (var client = new WebClient())
            {
                try
                {
                    byte[] pageData = fetch(client, @"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1");
                    string pageHtml = Encoding.UTF8.GetString(pageData);
                    Console.WriteLine("元网页已下载");

                    Directory.CreateDirectory(OutputDirectory);

                    int index = 0;       // position of the image on the page; part of the file name
                    int downloaded = 0;  // number of images actually saved

                    foreach (string url in extract_values(pageHtml, "data-original='", "'"))
                    {
                        // Pause before every image to stay below the site's crawler detection.
                        polite_delay("绕过网站爬虫行为检测中......");

                        try
                        {
                            Console.WriteLine(url);
                            byte[] jpgdata = fetch(client, url);

                            // WriteAllBytes truncates an existing file, so a shorter image
                            // never keeps stale trailing bytes from an earlier run.
                            File.WriteAllBytes(Path.Combine(OutputDirectory, ship_id + "_" + index + ".jpg"), jpgdata);

                            Console.WriteLine("成功下载第" + index + "张图片");
                            downloaded++;
                        }
                        catch (WebException webEx)
                        {
                            Console.WriteLine("被捕获了吗?");
                            Console.WriteLine(webEx.Message.ToString());
                        }

                        index++;
                    }

                    Console.WriteLine("*****************************************");
                    Console.WriteLine("下载" + downloaded + "张ship_id为" + ship_id + "的图片");
                    Console.WriteLine("*****************************************");
                }
                catch (WebException webEx)
                {
                    Console.WriteLine(webEx.Message.ToString());
                }
            }
        }

        /// <summary>
        /// Entry point: collects the ship ids, downloads the photos of every collected
        /// ship, then waits for Enter so the console window stays open.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> shipid_list = new List<string>();

            // To resume an interrupted run without re-crawling the index, replace the call
            // below with
            //     shipid_list.AddRange(File.ReadAllLines(Path.Combine(OutputDirectory, ShipIdListFileName)));
            // and remove the ids that were already processed (earlier runs skipped, for
            // example, 371652, 371668, 371681 and 1252401). For a quick single-ship test,
            // add one id such as "371681" to the list instead.
            download_all_shipid(shipid_list);

            foreach (var ship_id in shipid_list)
            {
                download_jpg(ship_id);
            }

            Console.ReadLine(); // keep the console window open until Enter is pressed
        }
    }
}

版权声明:本文为博主原创文章,欢迎转载和分享,但请声明出处http://blog.csdn.net/zhzz2012

  相关解决方案