当前位置: 代码迷 >> Java相关 >> 用java实现新浪爬虫,代码完整辨析(仅针对当前SinaSignOn有效)
  详细解决方案

用java实现新浪爬虫,代码完整辨析(仅针对当前SinaSignOn有效)

热度:76   发布时间:2016-04-22 20:10:19.0
用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)

先来看我们的web.xml文件,如下

 <!DOCTYPE web-app PUBLIC
     "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
     "http://java.sun.com/dtd/web-app_2_3.dtd">

<!-- Deployment descriptor: registers the listener class that starts the spider. -->
<web-app>
    <display-name>MySinaSpider</display-name>
    <listener>
        <listener-class>main.java.sina.spider.StartSpiderLisenter</listener-class>
    </listener>
</web-app>

这样的配置当启动tomcat的时候,就会运行爬虫,然后再看我们的StartSpiderLisenter类,如下

 1 package main.java.sina.spider; 2  3 import javax.servlet.ServletContextEvent; 4 import javax.servlet.ServletContextListener; 5 import main.java.sina.bean.info.LoginInfo; 6 import main.java.sina.utils.Constant; 7  8 public class StartSpiderLisenter implements ServletContextListener{ 9 10     public void contextDestroyed(ServletContextEvent arg0) {11         12     }13 14     public void contextInitialized(ServletContextEvent arg0) {15         Constant.personalHomePage = "http://weibo.com/zhaoyao2012/home"; //填写你自己的新浪微博个人主页16         LoginInfo.username = "***"; //填写你的新浪微博用户名18         LoginInfo.password = "***"; //填写你的新浪微博密码19         Constant.enableProxy = false; //是否使用代理20         Spider.start();21     }22 23 }

很明显,StartSpiderLisenter类实现了ServletContextListener这个接口,因此必须实现它的两个方法:contextInitialized和contextDestroyed,它们分别在Web应用初始化和销毁的时候被容器调用。可以看到,在contextInitialized这个初始化上下文的方法中调用了Spider.start()方法。那么我们来看看Spider这个类,如下:

  1 package main.java.sina.spider;  2   3 import java.io.IOException;  4 import java.util.regex.Matcher;  5 import java.util.regex.Pattern;  7 import org.quartz.JobBuilder;  8 import org.quartz.JobDetail;  9 import org.quartz.Scheduler; 10 import org.quartz.SchedulerException; 11 import org.quartz.SchedulerFactory; 12 import org.quartz.SimpleScheduleBuilder; 13 import org.quartz.SimpleTrigger; 14 import org.quartz.TriggerBuilder; 15 import org.quartz.impl.StdSchedulerFactory; 17 import main.java.sina.bean.info.LoginInfo; 18 import main.java.sina.httpclient.LoginSina; 19 import main.java.sina.httpclient.SpiderSina; 20 import main.java.sina.job.KeywordSearchJob; 21 import main.java.sina.utils.Constant; 22 import main.java.sina.utils.HttpHelper; 23 import main.java.test.SpiderTest; 24  25 public class Spider { 26  27     public static void main(String[] args) { 28  29         Constant.personalHomePage = "****";     30         LoginInfo.username = "****"; 31         LoginInfo.password = "****"; 32         Constant.enableProxy = false; 33         Constant.hourbefore = 0;  //这个参数用于设置时差 34         start(); 35          36     } 37     public static void start() { 38          39         final SchedulerFactory factory = new StdSchedulerFactory(); 40         try { 41             Scheduler scheduler = factory.getScheduler(); 42             JobDetail jobDetail = JobBuilder.newJob(KeywordSearchJob.class) 43                     .withIdentity("keywordSearch", "weibo").build(); 44             SimpleTrigger trigger = TriggerBuilder.newTrigger() 45                     .withIdentity("keywordSearch", "weibo") 46                     .withSchedule(SimpleScheduleBuilder.repeatHourlyForever()) 47                     .build(); 48             scheduler.scheduleJob(jobDetail, trigger); 49             scheduler.start(); 50         } catch (SchedulerException e) { 51             e.printStackTrace(); 52         } 53     } 54  55     public static SpiderSina createSpider() { 56         LoginSina 
ls = new LoginSina(LoginInfo.username, LoginInfo.password); 57         ls.dologinSina(); 58         ls.redirect(); 59         SpiderSina spider = new SpiderSina(ls); 60  61         return spider; 62     } 63  64     public static void sendMidsofDays(SpiderSina spider,String keyword, String fromdate, 65             String todate) { 66          67         try { 68             String midsString = ""; 69             for (int i = 1; i <= 50; i++) { 70                 String htmlContent = spider 71                         .search(keyword, i, fromdate, todate); 72                 if (htmlContent.contains("noresult_support")) { 73                     break; 74                 } 75                 System.out.println(i); 76                 Pattern pattern = Pattern.compile("<div mid=\"([0-9]*)\""); 77  78                 String start = "\"pid\":\"pl_weibo_direct\""; 79                 try { 80                     htmlContent = htmlContent.substring(htmlContent 81                             .indexOf(start)); 82                 } catch (Exception e) { 83                     htmlContent = htmlContent.substring(1); 84                 } 85                 htmlContent = htmlContent.replace("\\\"", "\""); 86                 htmlContent = htmlContent.replace("\\/", "/"); 87                 Matcher matcher = pattern.matcher(htmlContent); 88                 while (matcher.find()) { 89                     System.out.println(matcher.group(1)); 90                     midsString += matcher.group(1) + ","; 91                 } 92                 if (i == 37) { 93                     try { 94                         Thread.sleep(1000 * 60 * 30); 95                     } catch (InterruptedException e) { 96                         e.printStackTrace(); 97                     } 98                 } 99             }100             System.out.println(midsString);101             HttpHelper.getLiveData(midsString, Constant.CommentUrl);102         } catch (IOException e) {103             
e.printStackTrace();104         }105 106     }107 }

我们在Spider.start()方法中,看到了作业KeywordSearchJob.class,那么我们来看看这个KeywordSearchJob类的实现,如下:

 1 package main.java.sina.job; 2  3 import org.quartz.Job; 4 import org.quartz.JobExecutionContext; 5 import org.quartz.JobExecutionException; 6 import main.java.sina.httpclient.SpiderSina; 7 import main.java.sina.spider.Spider; 8 import main.java.sina.utils.Constant; 9 import main.java.sina.utils.Utils;10 11 public class KeywordSearchJob implements Job {12 13     public void execute(JobExecutionContext arg0) throws JobExecutionException {14 15         Constant.enableProxy = false; //我的爬虫中没有使用代理,故值设为false.16         String keyword = "%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6";//被编码后的关键字17         String datehour = Utils.getDateOfSpecifiedPreHour(Constant.hourbefore);//这个工具类实现了时差格式的转换18         SpiderSina spider = Spider.createSpider();19         spider.forwardToWeiboPage();20         Spider.sendMidsofDays(spider,keyword,datehour,datehour);21     }22 23 }

接下来,我们看几个工具类的实现:首先来看下Utils.java这个类,如下:它实现了日期的格式的一些转换

  1 package main.java.sina.utils;  2   3 import java.io.BufferedReader;  4 import java.io.BufferedWriter;  5 import java.io.File;  6 import java.io.FileInputStream;  7 import java.io.FileNotFoundException;  8 import java.io.FileOutputStream;  9 import java.io.FileWriter; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.StringReader; 14 import java.io.UnsupportedEncodingException; 15 import java.text.ParseException; 16 import java.text.SimpleDateFormat; 17 import java.util.Calendar; 18 import java.util.Date; 19 import java.util.Properties; 20  21 import org.htmlparser.Parser; 22 import org.htmlparser.lexer.Lexer; 23 import org.htmlparser.lexer.Page; 24 import org.htmlparser.util.DefaultParserFeedback; 25 //  I/O操作类 26 public class Utils { 27      28     public static Date getDateFromString(String dtext,Date fileCreateDate) { 29         Date date=null; 30         int y,mm,se;   31         Calendar c = Calendar.getInstance();   32         c.setTime(fileCreateDate); 33         y = c.get(Calendar.YEAR); // 34         //d = c.get(Calendar.DAY_OF_MONTH); // 35         mm = c.get(Calendar.MINUTE); // 36         se = c.get(Calendar.SECOND);// 37         if(dtext.contains("秒前")){ 38             int end=0; 39             for(int i=0;i<dtext.length();i++){ 40                 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 41                     end++; 42                 }else{ 43                     break; 44                 } 45             } 46             dtext=dtext.substring(0,end); 47             int second=Integer.parseInt(dtext); 48             c.set(Calendar.SECOND, se-second); 49             date=c.getTime(); 50         } 51         else if(dtext.contains("分钟前")){ 52             int end=0; 53             for(int i=0;i<dtext.length();i++){ 54                 if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 55                     end++; 56                 }else{ 57                     break; 58      
           } 59             } 60             dtext=dtext.substring(0,end); 61             int minute=Integer.parseInt(dtext); 62             c.set(Calendar.MINUTE, mm-minute); 63             date=c.getTime(); 64         }else if(dtext.contains("今天")){ 65              dtext=dtext.replace("今天 ", "").trim(); 66              String ss[]=dtext.split(":"); 67              if(ss!=null && ss.length==2){ 68                  c.set(Calendar.HOUR_OF_DAY, Integer.parseInt(ss[0])); 69                  c.set(Calendar.MINUTE, Integer.parseInt(ss[1])); 70                  date=c.getTime(); 71              } 72         }else if(dtext.contains("月")){ 73             dtext=y+"年".concat(dtext); 74             SimpleDateFormat sf=new SimpleDateFormat("yyyy年MM月dd日 HH:mm"); 75             try { 76                 date=sf.parse(dtext); 77             } catch (ParseException e) { 78                 e.printStackTrace(); 79             } 80         }else if(dtext.contains("-")){ 81             SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm"); 82             try { 83                 date=sf.parse(dtext); 84             } catch (ParseException e) { 85                 e.printStackTrace(); 86             } 87         } 88         return date; 89     } 90     public static void writeFileFromStream(String filename,InputStream in){ 91         if(filename==null || filename.trim().length()==0) 92             return; 93         File file=new File(filename); 94         if(!file.exists()){ 95             try { 96                 file.createNewFile(); 97             } catch (IOException e) { 98                 e.printStackTrace(); 99             }100         }101         FileOutputStream fou=null;102         try {103             fou = new FileOutputStream(file);104             byte []buffer=new byte[1024*4];105             int len=-1;106             while((len=in.read(buffer))!=-1){107                 fou.write(buffer,0,len);108             }109         } catch (FileNotFoundException e) {110    
         e.printStackTrace();111         } catch (IOException e) {112             e.printStackTrace();113         }finally{114             if(in!=null)115                 try {116                     in.close();117                 } catch (IOException e) {118                     e.printStackTrace();119                 }120             if(fou!=null)121                 try {122                     fou.close();123                 } catch (IOException e) {124                     e.printStackTrace();125                 }126         }127     }    128     public static void writeFileFromString(String filename,String str){129         if(filename==null || filename.trim().length()==0)130             filename="tmp.txt";131         File file=new File(filename);132         if(!file.exists()){133             try {134                 file.createNewFile();135             } catch (IOException e) {136                 e.printStackTrace();137             }138         }139         BufferedWriter writer=null;140         BufferedReader reader=null;141         try {142             writer=new BufferedWriter(new FileWriter(file));143             reader=new BufferedReader(new StringReader(str));144             String tmp=null;145             StringBuffer buffer=new StringBuffer();146             while((tmp=reader.readLine())!=null)147                 buffer.append(tmp+"\n");148             writer.write(buffer.toString());149             150         } catch (IOException e) {151             e.printStackTrace();152         }finally{153             try {154                 reader.close();155                 writer.close();156             } catch (IOException e) {157                 e.printStackTrace();158             }159         }160         161     }162     163     164     165     public static String getStringFromStream(InputStream in) {166         BufferedReader reader=null;167         reader = new BufferedReader(new InputStreamReader(in));168         StringBuffer buffer=new 
StringBuffer();169         String str=null;170         try{171             while((str=reader.readLine())!=null){172                 buffer.append(str+"\n");173             }    174             reader.close();175         }catch(Exception ex){176             ex.printStackTrace();177         }            178         try {179             return new String(buffer.toString().getBytes(),"utf-8");180         } catch (UnsupportedEncodingException e) {181             e.printStackTrace();182             return "error:"+e.getMessage();183         }184     }185   //得到数据库的配置信息186     public static Properties getDBconfig(){187         Properties properties=new Properties();188         InputStream in = null;189         try {190             in = new FileInputStream(new File("config/dbconfig.ini"));191             properties.load(in);192         } catch (FileNotFoundException e) {193             e.printStackTrace();194         } catch (IOException e) {195             e.printStackTrace();196         }finally{197             if(in!=null)198                 try {199                     in.close();200                 } catch (IOException e) {201                     e.printStackTrace();202                 }203         }204         return properties;205     }206     207     public static Parser createParser(String inputHTML) {208         Lexer mLexer = new Lexer(new Page(inputHTML));209         Parser parser = new Parser(mLexer, new DefaultParserFeedback(210                 DefaultParserFeedback.QUIET));211         return parser;212     }213     214     public static String getDateOfSpecifiedPreHour(int hourNum){215         SimpleDateFormat sdFormat = new SimpleDateFormat("yyyy-MM-dd-HH");216         Date date = new Date();217         System.out.println("date -" +date + " " + hourNum);218         Calendar calendar = Calendar.getInstance();219         calendar.setTime(date);220         calendar.add(Calendar.HOUR_OF_DAY, -1 * hourNum);221         System.out.println("date2 -" 
+sdFormat.format(calendar.getTime()));222         return sdFormat.format(calendar.getTime());223     }    224 }

再来看一下ThreadPool.java这个类,如下:这是一个线程工具类,定义了线程的一些动作

 1 package main.java.sina.utils; 2  3 import java.util.List; 4 import java.util.concurrent.ExecutorService; 5 import java.util.concurrent.Executors; 6  7 /** 9  * 线程池工具类10  */11 public class ThreadPool {12     private ExecutorService service;13     private List<Thread> threadList;14 15     public ThreadPool(int limite, List<Thread> threadList) {16         this.service = Executors.newFixedThreadPool(limite);17         this.threadList = threadList;18     }19 20     public void execute() {21         if(threadList==null ||threadList.size()==0) return ;22         for (int index = 0; index < threadList.size(); index++) {23             Thread t=threadList.get(index);24             service.execute(t);25         }26     }27     public boolean isTerminated(){28         return service.isTerminated();29     }30     31     public void shutDown() {32         service.shutdown();33     }34 }

然后再看一下Constant.java这个常量类,如下:常量类把系统中用到的一些常量写在这里,以后项目维护需要更改的时候,方便维护更改

package main.java.sina.utils;/** * @ClassName: Constant  *  */public class Constant {    public static boolean enableProxy = false;    public static String liveCommentUrl = "http://localhost:8080/social-hub-connector/loadingLiveData";    public static String CommentUrl = "http://localhost:8080/social-hub-connector/loadingData";    public static String personalHomePage = "******";    public static String weiboUsername = "*********";    public static String weiboPassword = "*********";    public static int hourbefore = 0;}

再来看一下Base64Encoder.java类,它是对一些字段进行Base64编码的类,如下:

 1 package main.java.sina.utils; 2  3 /** 4  *  5  */ 6 public class Base64Encoder { 7     private static final char last2byte = (char) Integer.parseInt("00000011", 2); 8     private static final char last4byte = (char) Integer.parseInt("00001111", 2); 9     private static final char last6byte = (char) Integer.parseInt("00111111", 2);10     private static final char lead6byte = (char) Integer.parseInt("11111100", 2);11     private static final char lead4byte = (char) Integer.parseInt("11110000", 2);12     private static final char lead2byte = (char) Integer.parseInt("11000000", 2);13     private static final char[] encodeTable = new char[]{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};14 15     public Base64Encoder() {16     }17     public static  String encode(byte[] from) {18         StringBuffer to = new StringBuffer((int) (from.length * 1.34) + 3);19         int num = 0;20         char currentByte = 0;21         for (int i = 0; i < from.length; i++) {22             num = num % 8;23             while (num < 8) {24                 switch (num) {25                     case 0:26                         currentByte = (char) (from[i] & lead6byte);27                         currentByte = (char) (currentByte >>> 2);28                         break;29                     case 2:30                         currentByte = (char) (from[i] & last6byte);31                         break;32                     case 4:33                         currentByte = (char) (from[i] & last4byte);34                         currentByte = (char) (currentByte << 2);35                         if ((i + 1) < from.length) {36                             currentByte |= (from[i + 1] & lead2byte) >>> 6;37                         
}38                         break;39                     case 6:40                         currentByte = (char) (from[i] & last2byte);41                         currentByte = (char) (currentByte << 4);42                         if ((i + 1) < from.length) {43                             currentByte |= (from[i + 1] & lead4byte) >>> 4;44                         }45                         break;46                 }47                 to.append(encodeTable[currentByte]);48                 num += 6;49             }50         }51         if (to.length() % 4 != 0) {52             for (int i = 4 - to.length() % 4; i > 0; i--) {53                 to.append("=");54             }55         }56         return to.toString();57     }58 }

下面这个类针对新浪的一些特殊加密规则编写了相应的方法,在拼接最终的URL的时候会用到,例如结合servertime和nonce两个参数对密码进行加密,生成一串字符串:

 1 package main.java.sina.utils; 2 import java.io.File; 3 import java.io.FileReader; 4  5 import javax.script.Invocable; 6 import javax.script.ScriptEngine; 7 import javax.script.ScriptEngineManager; 8  9 /**10  * 12  */13 public class EncodeSuAndSp {14     static ScriptEngineManager mgr = new ScriptEngineManager();  15     static ScriptEngine engine = mgr.getEngineByExtension("js");16     static Invocable inv = (Invocable) engine;   17       18     public static String getEncryptedP(String password,String servertime,String nonce){19         String value1="";20         try { 21             engine.eval(new FileReader(new File("js/encrypt.js")));22             value1 = String.valueOf(inv.invokeFunction("hex_sha1",password));23             value1 = String.valueOf(inv.invokeFunction("hex_sha1",value1));24             value1 = String.valueOf(inv.invokeFunction("hex_sha1",value1+servertime+nonce));25         } catch (Exception e) {26             e.printStackTrace();27         }28         return value1;29     }30     31     32     public static String getEncodedUsername(String username){33         String value1="";34         try {35             engine.eval(new FileReader(new File("js/encrypt.js")));36             value1 = String.valueOf(inv.invokeFunction("encode",username));37             System.out.println(value1);38         } catch (Exception e) {39             e.printStackTrace();40         }41         return value1;42     }43 }
package main.java.sina.utils;import java.io.UnsupportedEncodingException;import java.net.URLDecoder;import java.net.URLEncoder;public class EncodeUtils {    public static final String encodeURL(String str,String enc) {        try {            return URLEncoder.encode(str, enc);        } catch (UnsupportedEncodingException e) {            throw new RuntimeException(e);        }    }    public static final String decodeURL(String str,String enc) {        try {            return URLDecoder.decode(str, enc);        } catch (UnsupportedEncodingException e) {            throw new RuntimeException(e);        }    }        public static String unicdoeToGB2312(String str) {        String res = null;        if(str==null ){            return "";        }        StringBuffer sb = new StringBuffer();        try {            while (str.length() > 0) {                if (str.startsWith("\\u")) {                    int x = 0;                    try{                        x = Integer.parseInt(str.substring(2, 6), 16);                    }catch(Exception ex){                        x=  0;                    }                    sb.append((char) x);                    str = str.substring(6);                } else {                    sb.append(str.charAt(0));                    str = str.substring(1);                }            }            res = sb.toString();        } catch (Exception e) {            e.printStackTrace(System.err);        }        res=res.replaceAll("\\\\r", "")            .replaceAll("\\\\n", "")            .replaceAll("\\\\t", "")            .replaceAll("&nbsp;", "")            .replaceAll("&gt", "")            .replaceAll("\\[", "\"")            .replaceAll("\\]", "\"");        return res;    }        public static String unicodeTogb2312(String str) {        String res = null;        StringBuffer sb = new StringBuffer();        try {            while (str.length() > 0) {                if (str.startsWith("\\u")) {                    int x = 
Integer.parseInt(str.substring(2, 6), 16);                    sb.append((char) x);                    str = str.substring(6);                } else {                    sb.append(str.charAt(0));                    str = str.substring(1);                }            }            res = sb.toString();        } catch (Exception e) {            e.printStackTrace(System.err);        }        res=res.replaceAll("\\\\r", "")                .replaceAll("\\\\t", "")                .replaceAll("&nbsp;", "")                .replaceAll("&gt", "")               .replaceAll("\\\\n", "");        return res;    }}

HttpUtils.java这个类很关键,它封装了doGet()和doPost()这两个静态方法,如下:

package main.java.sina.utils;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;

/**
 * HTTP helper methods (GET, form POST, file POST, header/cookie extraction)
 * built on Apache HttpClient.
 */
public class HttpUtils {

    /**
     * Sends a GET request and prints the URI of the request that was actually executed.
     *
     * @param url     target address
     * @param headers request headers to add; may be {@code null}
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doGet(String url, Map<String, String> headers) {
        HttpClient client = createHttpClient();
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response = null;
        HttpContext httpContext = new BasicHttpContext();
        try {
            setHeaders(headers, getMethod);
            // The context must be handed to execute(); otherwise it is never populated
            // and the request lookup below always yields null.
            response = client.execute(getMethod, httpContext);
            Object realRequest = httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);
            if (realRequest instanceof HttpUriRequest) {
                System.out.println(((HttpUriRequest) realRequest).getURI());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            String msg = e.getMessage(); // may be null
            if (msg != null && msg.contains("Truncated chunk")) {
                System.out.println(msg + " 数据获取不完整,需要重新获取。");
            } else {
                System.out.println(msg + " 连接被拒绝,需要降低爬取频率。");
            }
        } catch (RuntimeException e) {
            // Stay best-effort (callers receive null), but no longer swallow the cause silently.
            e.printStackTrace();
        }
        System.out.println(response);
        return response;
    }

    /**
     * Sends a form-encoded (UTF-8) POST request.
     *
     * @param url     target address
     * @param headers request headers to add; may be {@code null}
     * @param params  form fields to post; may be {@code null} or empty, in which case no entity is set
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doPost(String url, Map<String, String> headers, Map<String, String> params) {
        HttpClient client = createHttpClient();
        HttpPost postMethod = new HttpPost(url);
        HttpResponse response = null;
        try {
            setHeaders(headers, postMethod);
            if (params != null && !params.isEmpty()) {
                List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
                for (Map.Entry<String, String> field : params.entrySet()) {
                    pairs.add(new BasicNameValuePair(field.getKey(), field.getValue()));
                }
                postMethod.setEntity(new UrlEncodedFormEntity(pairs, HTTP.UTF_8));
            }
            response = client.execute(postMethod);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return response;
    }

    /**
     * Uploads one file in a hand-built multipart-style body. The boundary is taken
     * from the part of the {@code Content-Type} header after the first {@code '='}.
     *
     * @param url      target address
     * @param headers  request headers to add; may be {@code null}
     * @param fileName path of the file to upload
     * @return the response, or {@code null} when the request failed
     */
    public static HttpResponse doPost(String url, Map<String, String> headers, String fileName) {
        HttpClient client = createHttpClient();
        HttpPost postMethod = new HttpPost(url);
        String boundary = "";
        HttpResponse response = null;
        InputStream in = null;
        try {
            setHeaders(headers, postMethod);
            String contentType = (headers == null) ? null : headers.get("Content-Type");
            if (contentType != null) {
                boundary = contentType.substring(contentType.indexOf("=") + 1);
            }

            File file = new File(fileName);
            in = new FileInputStream(file);

            StringBuffer buffer = new StringBuffer();
            buffer.append(boundary).append("\n")
                  .append("Content-Disposition: form-data; name=\"pic1\"; filename=\"" + file.getName()).append("\"\n")
                  .append("Content-Type: image/pjpeg").append("\n")
                  .append("\n");

            System.out.println(buffer.toString());

            // NOTE(review): the file content goes through a String before being Base64-encoded;
            // this may be lossy for binary files - confirm what Utils.getStringFromStream does.
            String tmpstr = Utils.getStringFromStream(in);
            tmpstr = Base64Encoder.encode(tmpstr.getBytes());
            buffer.append(tmpstr).append("\n");
            buffer.append(boundary + "--").append("\n");

            System.out.println(buffer.toString());

            // Encode once so that the declared length always matches the bytes sent.
            byte[] payload = buffer.toString().getBytes();
            postMethod.setEntity(new InputStreamEntity(new ByteArrayInputStream(payload), payload.length));

            response = client.execute(postMethod);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The file stream used to be leaked.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return response;
    }

    /**
     * Returns the response headers as a list.
     *
     * @param response the response; may be {@code null}
     * @return the headers, or {@code null} when there is no response or it has no headers
     */
    public static List<Header> getReponseHeaders(HttpResponse response) {
        if (response == null) {
            return null;
        }
        Header[] hds = response.getAllHeaders();
        if (hds == null || hds.length == 0) {
            return null;
        }
        List<Header> headers = new ArrayList<Header>(hds.length);
        for (Header header : hds) {
            headers.add(header);
        }
        return headers;
    }

    /**
     * Adds every entry of {@code headers} to {@code request}. Entries with a
     * {@code null} value are skipped.
     *
     * @param headers header names and values; may be {@code null}
     * @param request request to add the headers to
     */
    public static void setHeaders(Map<String, String> headers, HttpUriRequest request) {
        if (headers == null || headers.isEmpty()) {
            return;
        }
        for (Map.Entry<String, String> header : headers.entrySet()) {
            if (header.getValue() != null) {
                request.addHeader(header.getKey(), header.getValue());
            }
        }
    }

    /**
     * Extracts the name/value pair of every {@code Set-Cookie} header; cookie
     * attributes after the first {@code ';'} are ignored.
     *
     * @param response the response; may be {@code null}
     * @return the cookies, or {@code null} when there is no response or no {@code Set-Cookie} header
     */
    public static List<Cookie> getResponseCookies(HttpResponse response) {
        if (response == null) {
            return null;
        }
        List<Cookie> cookies = null;
        Header[] hds = response.getAllHeaders();
        if (hds != null) {
            for (Header header : hds) {
                if (!header.getName().equalsIgnoreCase("Set-Cookie")) {
                    continue;
                }
                if (cookies == null) {
                    cookies = new ArrayList<Cookie>();
                }
                String value = header.getValue();
                if (value == null) {
                    continue;
                }
                String[] nameAndValue = value.split(";")[0].split("=", 2);
                if (nameAndValue.length < 2) {
                    // A cookie without '=' used to raise ArrayIndexOutOfBoundsException.
                    continue;
                }
                cookies.add(new BasicClientCookie(nameAndValue[0], nameAndValue[1]));
            }
        }
        return cookies;
    }

    /**
     * Joins cookies into a {@code Cookie} request-header value ({@code name=value; name=value}).
     *
     * @param cookies the cookies; may be {@code null}
     * @return the joined string, or {@code null} when {@code cookies} is {@code null} or empty
     */
    public static String setCookie2String(List<Cookie> cookies) {
        if (cookies == null || cookies.isEmpty()) {
            return null;
        }
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < cookies.size(); j++) {
            Cookie c = cookies.get(j);
            if (j > 0) {
                builder.append("; ");
            }
            builder.append(c.getName()).append('=').append(c.getValue());
        }
        return builder.toString();
    }

    /**
     * Returns the content stream of the response entity.
     *
     * @param response the response; may be {@code null}
     * @return the stream, or {@code null} when there is no response, no entity, or opening the content failed
     */
    public static InputStream getInputStreamFromResponse(HttpResponse response) {
        if (response == null) {
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        InputStream in = null;
        try {
            in = entity.getContent();
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return in;
    }

    /**
     * Reads the response body as text via {@code Utils.getStringFromStream}.
     *
     * @param response the response; may be {@code null}
     * @return the body text, an empty string when no content stream is available,
     *         or {@code null} when {@code response} is {@code null}
     */
    public static String getStringFromResponse(HttpResponse response) {
        if (response == null) {
            return null;
        }
        InputStream in = getInputStreamFromResponse(response);
        String responseText = "";
        if (in != null) {
            responseText = Utils.getStringFromStream(in);
        }
        return responseText;
    }

    /**
     * Creates an HttpClient backed by a thread-safe connection manager (at most 20
     * connections), HTTP/1.1 and UTF-8 content charset. The fixed proxy is used only
     * when {@code Constant.enableProxy} is set.
     *
     * NOTE(review): every call creates a new connection manager that is never shut
     * down here; sharing one pool would require all callers to consume their responses.
     */
    private final static HttpClient createHttpClient() {
        HttpParams params = new BasicHttpParams();
        if (Constant.enableProxy) {
            HttpHost proxy = new HttpHost("web-proxy-sha.chn.hp.com", 8080);
            params.setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
        }
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
        HttpProtocolParams.setContentCharset(params, "UTF-8");

        ThreadSafeClientConnManager clientmanager = new ThreadSafeClientConnManager();
        clientmanager.setMaxTotal(20);
        HttpClient client = new DefaultHttpClient(clientmanager, params);

        // Whether circular redirects (back to the same location) are allowed.
        client.getParams().setParameter("http.protocol.allow-circular-redirects", true);
        // Maximum number of redirects.
        client.getParams().setParameter("http.protocol.max-redirects", 50);
        // Whether redirects are followed automatically.
        client.getParams().setParameter("http.protocol.handle-redirects", false);
        return client;
    }

    /**
     * Creates a client that goes through a proxy picked at random from
     * {@code proxy.properties} (lines of {@code ip=port}).
     *
     * @return the proxied HttpClient
     * @throws RuntimeException when the properties file yields no proxy
     */
    public static HttpClient getDefaultHttpClientByProxy() {
        HttpClient httpclient = createHttpClient();
        Map<String, String> map = ReadIni.getDbini("proxy.properties");
        if (map.size() == 0) {
            throw new RuntimeException("无可用代理");
        }
        Set<String> set = map.keySet();
        String[] array = set.toArray(new String[set.size()]);
        String ip = array[new Random().nextInt(array.length)];
        String port = map.get(ip);
        // trim() tolerates stray whitespace around the port in the properties file.
        HttpHost proxy = new HttpHost(ip, Integer.parseInt(port.trim()));
        httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
        return httpclient;
    }
}

接下来看一个HttpHelper的辅助类,如下:

/**
 * Small POST helpers built on Commons HttpClient 3.x.
 */
package main.java.sina.utils;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * @ClassName: HttpHelper
 *
 * Posts form parameters to a service and returns the response body as text.
 */
public class HttpHelper {

    /**
     * Posts {@code requestData} as the {@code mids} parameter to {@code url}.
     *
     * @param requestData value of the {@code mids} form parameter
     * @param url         address to post to
     * @return the response body (also printed to standard output)
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getLiveData(String requestData, String url)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod(url);
        postMethod.setParameter("mids", requestData);
        return executeAndRead(postMethod);
    }

    /**
     * Posts {@code userid} and {@code hobbys} to the fixed hobby-loading endpoint.
     *
     * @param userid value of the {@code userid} form parameter
     * @param hobbys value of the {@code hobbys} form parameter
     * @return the response body (also printed to standard output)
     * @throws HttpException on a protocol error
     * @throws IOException   on a transport error
     */
    public static String getHobbyData(String userid, String hobbys)
            throws HttpException, IOException {
        PostMethod postMethod = new PostMethod("http://c0048925.itcs.hp.com:8080/connector/loadingHobby");
        postMethod.setParameter("userid", userid);
        postMethod.setParameter("hobbys", hobbys);
        return executeAndRead(postMethod);
    }

    /** Executes the request, prints and returns its body, and always releases the connection. */
    private static String executeAndRead(PostMethod postMethod) throws HttpException, IOException {
        try {
            new HttpClient().executeMethod(postMethod);
            String response = postMethod.getResponseBodyAsString();
            System.out.println(response);
            return response;
        } finally {
            // Previously skipped whenever executeMethod or the body read threw.
            postMethod.releaseConnection();
        }
    }
}

ReadIni.java类,用于读取文本配置文件,如下:

package main.java.sina.utils;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Reads simple {@code key=value} text files.
 */
public class ReadIni {

    /**
     * Loads every {@code key=value} line of {@code file} into a map. Blank lines and
     * lines without a key and a value around {@code '='} are skipped; keys and values
     * are trimmed.
     *
     * @param file path of the file to read
     * @return the entries read; empty when the file is missing or unreadable (never {@code null})
     */
    public static Map<String, String> getDbini(String file) {
        Map<String, String> map = new HashMap<String, String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String line;
            // Advance on every iteration: reading the next line only inside the
            // non-blank branch used to spin forever on a blank line.
            while ((line = br.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue;
                }
                String[] pair = getIni(line);
                if (pair.length < 2) {
                    continue; // no value part: previously aborted parsing with an exception
                }
                String key = pair[0].trim();
                if (key.length() == 0) {
                    continue;
                }
                map.put(key, pair[1].trim());
            }
        } catch (FileNotFoundException e) {
            // A missing file now yields an empty map instead of a NullPointerException.
            System.err.println("ReadIni: cannot open " + file + ": " + e.getMessage());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return map;
    }

    /**
     * Splits a line on {@code '='}.
     *
     * @param str the line to split
     * @return the parts in order
     */
    public static String[] getIni(String str) {
        return str.split("=");
    }
}

然后,我们转到登录新浪的部分,来看一下LoginSina这个类的实现:

package main.java.sina.httpclient;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.GeneralSecurityException;
import java.security.InvalidKeyException;
import java.security.KeyFactory;
import java.security.NoSuchAlgorithmException;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.security.spec.RSAPublicKeySpec;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.httpclient.params.HttpParams;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.cookie.Cookie;
import org.springframework.core.io.ClassPathResource;

import main.java.sina.json.msg.PreLoginResponseMessage;
import main.java.sina.utils.Base64Encoder;
import main.java.sina.utils.EncodeUtils;
import main.java.sina.utils.HttpUtils;
import main.java.sina.utils.JsonUtils;
import main.java.sina.utils.Utils;

/**
 * Logs in to Sina Weibo through the Sina sign-on endpoints: a pre-login request
 * fetches the server time, nonce and RSA public key, which are then used to encrypt
 * the password for the actual login POST.
 */
public class LoginSina {
    private String username;
    private String password;
    private String rsakv;
    private String pubkey;

    // servertime and nonce are both needed at login time to encrypt the posted data
    private String servertime;     // server time
    private String nonce;          // one-time string
    private String userid;         // Weibo user id
    private String pcid;           // needed when a captcha has to be entered
    private String userdomainname; // user domain name
    private String door;           // captcha text

    private Map<String, String> headers = null;

    private List<Cookie> cookies = null;

    /**
     * Stores the credentials and immediately performs the pre-login request.
     *
     * @param username Weibo user name
     * @param password Weibo password
     * @throws IllegalStateException when the pre-login response is missing or unusable
     */
    public LoginSina(String username, String password) {
        this.username = username;
        this.password = password;
        init();
    }

    /**
     * @return a copy of the headers used by the last request, or {@code null} when there are none
     */
    public Map<String, String> getHeaders() {
        if (headers == null || headers.isEmpty()) {
            return null;
        }
        return new HashMap<String, String>(headers);
    }

    /**
     * @return a copy of the cookies received at login, or {@code null} when there are none
     */
    public List<Cookie> getCookies() {
        if (cookies == null || cookies.isEmpty()) {
            return null;
        }
        return new ArrayList<Cookie>(cookies);
    }

    /**
     * Logs in to Weibo. When the response does not contain {@code retcode=0} the
     * captcha image is downloaded, the captcha is read from standard input and the
     * login is retried with it.
     *
     * @return the response text of the attempt that succeeded
     * @throws IllegalStateException when the login request produced no response
     */
    public String dologinSina() {
        System.out.println("---do login, please hold on...---");
        String url = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)"; // v1.3.17
        Map<String, String> headers = new HashMap<String, String>();
        Map<String, String> params = new HashMap<String, String>();

        /* HTTP headers reference: http://www.cnblogs.com/yuzhongwusan/archive/2011/10/20/2218954.html */
        headers.put("Accept", "text/html, application/xhtml+xml, */*");
        headers.put("Referer", "http://login.sina.com.cn/member/my.php?entry=sso");
        headers.put("Accept-Language", "zh-cn");
        headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");
        headers.put("Host", "login.sina.com.cn");
        headers.put("Connection", "Keep-Alive");
        headers.put("Content-Type", "application/x-www-form-urlencoded");
        headers.put("Cache-Control", "no-cache");

        params.put("encoding", "UTF-8");
        params.put("entry", "weibo");
        params.put("from", "");
        params.put("prelt", "112");
        params.put("gateway", "1");
        params.put("nonce", nonce);
        params.put("pwencode", "rsa2"); // wsse
        params.put("returntype", "META");
        params.put("pagerefer", "");
        params.put("savestate", "7");
        params.put("servertime", servertime);
        params.put("rsakv", rsakv);
        params.put("service", "miniblog");
        params.put("sp", getEncryptedP());
        params.put("ssosimplelogin", "1");
        params.put("su", getEncodedU());
        params.put("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack");
        params.put("useticket", "1");
        params.put("vsnf", "1");
        if (door != null) {
            // The captcha typed in by the user used to be read but never sent, so a
            // retry could not succeed.
            // NOTE(review): "door"/"pcid" are taken to be the field names the endpoint
            // expects - confirm against the ssologin.js in use.
            params.put("door", door);
            if (pcid != null) {
                params.put("pcid", pcid);
            }
        }

        HttpResponse response = HttpUtils.doPost(url, headers, params);
        this.headers = headers;
        if (response == null) {
            throw new IllegalStateException("login request failed: no response received");
        }
        this.cookies = HttpUtils.getResponseCookies(response);
        String responseText = HttpUtils.getStringFromResponse(response);
        try {
            // NOTE(review): re-decoding platform-charset bytes as GBK is only lossless when
            // the platform charset is GBK - confirm what Utils.getStringFromStream returns.
            responseText = new String(responseText.getBytes(), "GBK");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }

        if (!responseText.contains("retcode=0")) {
            downloadCheckImage();
            this.nonce = getnonce();
            Scanner s = new Scanner(System.in); // deliberately not closed: that would close System.in
            if (responseText.contains("retcode=4049")) {
                System.out.println("请输入验证码:");
            } else if (responseText.contains("retcode=2070")) {
                System.out.println("验证码不正确,请再次输入验证码:");
            }
            this.door = s.next();
            // Return the result of the retry; the failed response used to be returned
            // after an unconditional success message.
            return dologinSina();
        }

        System.out.println("Congratulations, you have login success!");
        return responseText;
    }

    /**
     * Performs the post-login redirect request with the login cookies attached.
     *
     * @return the response text, or {@code null} when the request failed
     */
    public String redirect() {
        String cookieValue = HttpUtils.setCookie2String(this.cookies);
        if (this.headers == null) {
            this.headers = new HashMap<String, String>();
        }
        this.headers.clear();
        this.headers.put("Accept", "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
        this.headers.put("Accept-Language", "zh-cn");
        this.headers.put("Connection", "Keep-Alive");
        this.headers.put("Host", "sina.com.cn");
        this.headers.put("Referer", "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)");
        // NOTE(review): the header name "User" looks like a typo for "User-Agent" - left unchanged.
        this.headers.put("User", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");
        if (cookieValue != null) {
            this.headers.put("Cookie", cookieValue);
        }

        String url = "http://weibo.com/ajaxlogin.php?"
                + "framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&"
                + "sudaref=weibo.com";
        // The identical GET used to be issued twice with the first response discarded.
        HttpResponse response = HttpUtils.doGet(url, this.headers);
        return HttpUtils.getStringFromResponse(response);
    }

    /** Generates a random 6-character one-time string used for encryption. */
    private String getnonce() {
        String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        StringBuilder out = new StringBuilder(6);
        for (int i = 0; i < 6; i++) {
            out.append(alphabet.charAt((int) Math.ceil(Math.random() * 1000000) % alphabet.length()));
        }
        return out.toString();
    }

    /**
     * Pre-login: fetches the server time, the one-time nonce, the RSA public key,
     * rsakv and pcid.
     */
    private void init() {
        String url = compositeUrl();
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Accept", "*/*");
        headers.put("Referer", "http://weibo.com/");
        headers.put("Accept-Language", "zh-cn");
        headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");
        headers.put("Host", "login.sina.com.cn");
        headers.put("Connection", "Keep-Alive");

        HttpResponse response = HttpUtils.doGet(url, headers);
        String responseText = HttpUtils.getStringFromResponse(response);
        if (responseText == null) {
            throw new IllegalStateException("prelogin request failed: no response received");
        }
        // The JSON object is wrapped in a callback call; keep only the {...} part.
        int begin = responseText.indexOf("{");
        int end = responseText.lastIndexOf("}");
        if (begin < 0 || end < begin) {
            throw new IllegalStateException("prelogin response contains no JSON object: " + responseText);
        }
        PreLoginResponseMessage plrmsg =
                JsonUtils.jsontoPreLoginResponseMessage(responseText.substring(begin, end + 1));
        this.nonce = plrmsg.getNonce();
        this.servertime = plrmsg.getServertime() + "";
        this.pubkey = plrmsg.getPubkey();
        this.rsakv = plrmsg.getRsakv();
        this.pcid = plrmsg.getPcid();
    }

    /** Downloads the captcha image to the {@code checkImage.jpeg} classpath resource. */
    private void downloadCheckImage() {
        if (pcid == null) {
            return;
        }
        this.headers.remove("Content-Type");
        if (this.cookies != null) {
            this.cookies.clear();
        }
        // The cookies were just cleared, so send no Cookie header (a null value used to be stored).
        this.headers.remove("Cookie");

        String url = "http://login.sina.com.cn/cgi/pin.php?r=" + (long) (Math.random() * 100000000) + "&s=0&p=" + this.pcid;
        HttpResponse response = HttpUtils.doGet(url, headers);
        InputStream in = HttpUtils.getInputStreamFromResponse(response);
        if (in == null) {
            System.err.println("captcha image could not be downloaded");
            return;
        }
        try {
            Utils.writeFileFromStream(new ClassPathResource("checkImage.jpeg").getFile().getPath(), in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing the response stream fails
            }
        }
    }

    /** Builds the pre-login URL. */
    private String compositeUrl() {
        StringBuilder builder = new StringBuilder();
        builder.append("http://login.sina.com.cn/sso/prelogin.php?")
               .append("entry=weibo&callback=sinaSSOController.preloginCallBack&")
               .append("su=" + getEncodedU())
               .append("&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_=" + System.currentTimeMillis());
        return builder.toString();
    }

    /** Encodes the user name: URL-encode, then Base64. */
    private String getEncodedU() {
        if (username != null && username.length() > 0) {
            return Base64Encoder.encode(EncodeUtils.encodeURL(username, "utf-8").getBytes());
        }
        return "";
    }

    /** Encrypts the password together with servertime and nonce using the server's RSA key. */
    private String getEncryptedP() {
        String data = servertime + "\t" + nonce + "\n" + password;
        return rsaCrypt(pubkey, "10001", data);
    }

    /**
     * Encrypts {@code servertime + TAB + nonce + LF + pwd} with the given RSA public key.
     *
     * @param pubkey      RSA modulus in hex
     * @param exponentHex RSA public exponent in hex
     * @param pwd         password
     * @param servertime  server time from pre-login
     * @param nonce       one-time string from pre-login
     * @return hex-encoded ciphertext, or an empty string when encryption fails
     */
    public static String rsaCrypt(String pubkey, String exponentHex, String pwd, String servertime, String nonce) {
        String data = servertime + "\t" + nonce + "\n" + pwd;
        return rsaCrypt(pubkey, exponentHex, data);
    }

    /**
     * RSA-encrypts {@code messageg} with the given public key.
     *
     * @param pubkey      RSA modulus in hex
     * @param exponentHex RSA public exponent in hex
     * @param messageg    plain text to encrypt
     * @return hex-encoded ciphertext, or an empty string when encryption fails
     * @throws NumberFormatException when {@code pubkey} or {@code exponentHex} is not valid hex
     */
    public static String rsaCrypt(String pubkey, String exponentHex, String messageg) {
        try {
            // RSAPublicKeySpec takes (modulus, publicExponent); the former local names were swapped.
            BigInteger modulus = new BigInteger(pubkey, 16);
            BigInteger publicExponent = new BigInteger(exponentHex, 16);
            KeyFactory factory = KeyFactory.getInstance("RSA");
            RSAPublicKey pub = (RSAPublicKey) factory.generatePublic(new RSAPublicKeySpec(modulus, publicExponent));
            Cipher enc = Cipher.getInstance("RSA");
            enc.init(Cipher.ENCRYPT_MODE, pub);
            byte[] encryptedContentKey = enc.doFinal(messageg.getBytes());
            return new String(Hex.encodeHex(encryptedContentKey));
        } catch (GeneralSecurityException e) {
            System.out.println(e.getMessage());
            return "";
        }
    }

    public void setUserid(String userid) {
        this.userid = userid;
    }

    public String getUserid() {
        return userid;
    }

    public void setUserdomainname(String userdomainname) {
        this.userdomainname = userdomainname;
    }

    public String getUserdomainname() {
        return userdomainname;
    }
}

SpiderSina类如下:

  1 package main.java.sina.httpclient;  2 import java.util.HashMap;  3 import java.util.List;  4 import java.util.Map;  5   6 import org.apache.http.HttpResponse;  7 import org.apache.http.cookie.Cookie;  8   9 import main.java.sina.utils.Constant; 10 import main.java.sina.utils.EncodeUtils; 11 import main.java.sina.utils.HttpUtils; 12 import main.java.sina.utils.Utils; 13  14 public class SpiderSina { 15     private LoginSina ls; 16     private Map<String,String> headers; 17     private final int  ADDFOLLOWING =1; 18     private final int  CANCELFOLLOWING =2; 19     public SpiderSina(LoginSina ls){ 20         this.ls=ls; 21         this.headers=new HashMap<String,String>(); 22         headers.put("Accept", "text/html, application/xhtml+xml, */*"); 23         headers.put("Accept-Language", "zh-cn"); 24         headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN"); 25         headers.put("Connection", "Keep-Alive"); 26         headers.put("Cache-Control", "no-cache"); 27         String cookieValue=HttpUtils.setCookie2String(ls.getCookies()); 28         headers.put("Cookie", cookieValue); 29     } 36     public String getGroupCategory(){ 37         String url="http://q.weibo.com/"; 38         this.headers.put("Host", "q.weibo.com"); 39         HttpResponse response=HttpUtils.doGet(url, headers); 40         String responseText=HttpUtils.getStringFromResponse(response); 41         responseText=EncodeUtils.unicdoeToGB2312(responseText); 42         return responseText; 43     } 44     public String search(String keyword, int pageNo){ 47         String url="http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&page="+pageNo; 48         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=shy_annan@126.com; myuid=5439352084; wvr=6; un=sm2014121904@126.com; _s_tentry=developer.51cto.com; SWB=usrmdinst_14; SUS=SID-5438576807-1419173757-GZ-lrze7-d8e1e3f082b428c12412c8ba30f0a6de; 
SUE=es%3D4cdfdd5d5f0f75141c092b32f89525a2%26ev%3Dv1%26es2%3D469e50c869315e57efeec3012c3bb6a8%26rs0%3DoWdG36CQ33LUEtKTvGn907Zy1mwFETvSVJsxeHEiaMPcKDB7pFxg596a2pLhFLJfQmswf4AvXYAkzTfemrYgWrz%252BQPustEA2wLNYufYpAZqFsGWanhTBq6elzB2yoZp41xcpy1WwXn1CuvzIzzEYpuILjHahkmJDQDQy6KaxlbA%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1419173757%26et%3D1419260157%26d%3Dc909%26i%3Da6de%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D27%26st%3D0%26uid%3D5438576807%26name%3Dsm2014121904%2540126.com%26nick%3DSocialMedia%25E5%259B%259B%25E5%25A8%2583%26fmp%3D%26lcp%3D; SUB=_2A255kq8tDeTxGeNK6FoU9yjEyzuIHXVa6DVlrDV8PUNbvtBeLW3TkW-bMoi0G_bBfpbS3TMqcXg6zDWFLA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhGThsH46uNrx1VY0ApV0SR5JpX5KMt; ALF=1450709756; SSOLoginState=1419173757; WBStore=bc5ad8450c3f8a48|undefined; Apache=1027467835228.8901.1419173761694; ULV=1419173761704:6:6:1:1027467835228.8901.1419173761694:1418797827169; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; ULOGIN_IMG=14192385783486"; 49         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 50         //headers.put("Accept-Encoding", "gzip, deflate, sdch"); 51         headers.put("Accept-Language", "zh-CN"); 52         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 53         headers.put("Connection", "Keep-Alive"); 54         headers.put("Cache-Control", "max-age=0"); 55         headers.put("Referer", "http://login.sina.com.cn/sso/login.php?url=http%3A%2F%2Fs.weibo.com%2Fweibo%2F%2525E6%252583%2525A0%2525E6%252599%2525AE%26page%3D2&_rand=1419173756.6387&gateway=1&service=weibo&entry=miniblog&useticket=1&returntype=META"); 56         headers.put("Cookie", cookieValue); 57         this.headers.put("Host", "s.weibo.com"); 58         HttpResponse response=HttpUtils.doGet(url, headers); 59         String responseText=HttpUtils.getStringFromResponse(response); 60         
responseText=EncodeUtils.unicdoeToGB2312(responseText); 61          62          63         return responseText; 64     } 65      66     public String searchCommentsByUid(String uid){ 67          68         String url="http://www.weibo.com/u/"+uid; 69         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; myuid=2035860051; wvr=6; YF-Ugrow-G0=ad06784f6deda07eea88e095402e4243; SSOLoginState=1423150079; YF-V5-G0=32eb5467e9bfc8b60c2d771056535ac5; _s_tentry=www.weibo.com; Apache=6264929557219.147.1423150103832; ULV=1423150103842:18:2:2:6264929557219.147.1423150103832:1422769721265; ULOGIN_IMG=1423233797946; YF-Page-G0=82cdcdfb16327a659fbb60cc9368fb19; SUS=SID-2035860051-1423286223-GZ-jdkh4-c8ea11de0a42151313986e52f9aa6017; SUE=es%3D8701ff5aca59244ff1ff263cf985bee6%26ev%3Dv1%26es2%3D7995c9eb7455697c09fac4f7486e14eb%26rs0%3DTyXXIRjcEw%252BeS5PaVSM%252FhQjc2JGhKBOe3uFTgShiIUAbPFI2eKtrgxM2wIi9A1xndiTFFM72zY%252FDKYFXONrgkao5cRo%252FHkydV%252FnaQjNmXoeESu5gi6Iq0aX883NhGR0utBVNZb5XaIG3X6HMMfBJC%252B7pnVHogEo8eD6cx8nzN5c%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1423286223%26et%3D1423372623%26d%3Dc909%26i%3D6017%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A2550e-fDeTxGeRO6FcZ9i7Mzj2IHXVap0ZXrDV8PUNbvtBuLWnTkW-gBGVORTA7J_lSZzAqzW6E50JjBQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; SUHB=0M20OGRPiOKzyc; ALF=1454822222; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn"; 70         headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 71         headers.put("Accept-Language", "zh-CN"); 72         headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 73         headers.put("Connection", "Keep-Alive"); 74         headers.put("Cache-Control", "max-age=0"); 
75         headers.put("Cookie", cookieValue); 76         this.headers.put("Host", "www.weibo.com"); 77         HttpResponse response=HttpUtils.doGet(url, headers); 78         String responseText=HttpUtils.getStringFromResponse(response); 79         responseText=EncodeUtils.unicdoeToGB2312(responseText); 82         return responseText; 83     }     85 //爬虫根据关键字,查询时间断,和查询页数  来得到htmlContent 86 public String search(String keyword, int pageNo, String fromdate,String todate){ 87     StringBuffer stringBuffer = new StringBuffer(200);    93     stringBuffer.append("http://s.weibo.com/weibo/"+ keyword +"&page="); 94     stringBuffer.append(pageNo); 95     stringBuffer.append("&typeall=1&suball=1&timescope=custom:"); 96     stringBuffer.append(fromdate); 97     stringBuffer.append(":"); 98     stringBuffer.append(todate); 99     stringBuffer.append("&Refer=g");104     String url = stringBuffer.toString();105     String cookieValue = headers.get("Cookie");106     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");107     //headers.put("Accept-Encoding", "gzip, deflate, sdch");108     headers.put("Accept-Language", "zh-CN");109     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");110     headers.put("Connection", "Keep-Alive");111     headers.put("Cache-Control", "max-age=0");112     headers.put("Referer", "http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&typeall=1&suball=1&timescope=custom:"+fromdate+":"+todate+"&Refer=g");113     headers.put("Cookie", cookieValue);114     this.headers.put("Host", "s.weibo.com");115     HttpResponse response=HttpUtils.doGet(url, headers);116     String responseText=HttpUtils.getStringFromResponse(response);117     responseText=EncodeUtils.unicdoeToGB2312(responseText);118     119     System.out.println("************htmlContent start***********");120     
System.out.println(responseText);121     System.out.println("************htmlContent end***********");125     return responseText;   127 } 129 public void forwardToWeiboPage(){130     String url = Constant.personalHomePage;131     headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");133     headers.put("Accept-Language", "zh-CN");134     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");135     headers.put("Connection", "Keep-Alive");137     this.headers.put("Host", "s.weibo.com");138     HttpResponse response=HttpUtils.doGet(url, headers);139     String responseText=HttpUtils.getStringFromResponse(response);140     responseText=EncodeUtils.unicdoeToGB2312(responseText);141     List<Cookie> cookies = HttpUtils.getResponseCookies(response);142     String cookie = HttpUtils.setCookie2String(cookies);144     headers.put("Cookie", cookie);   146 }150     public String getGroupCategory(int id){151         String url="http://q.weibo.com/class/category/?id="+id;152         this.headers.put("Host", "q.weibo.com");154         HttpResponse response=HttpUtils.doGet(url, headers);155         String responseText=HttpUtils.getStringFromResponse(response);156         responseText=EncodeUtils.unicdoeToGB2312(responseText);157         return responseText;158     }169     //得到微群管理员ID信息,其实用户成员的第一页 HTML页面170     public String getGroupAdministrator(String groupid) {171         String url="http://q.weibo.com/"+groupid+"/members/all";172         this.headers.remove("Referer");173         this.headers.put("Host", "q.weibo.com");174         this.headers.remove("Content-Type");175         this.headers.remove("x-requested-with");176         HttpResponse response=HttpUtils.doGet(url, headers);177         String responseText=HttpUtils.getStringFromResponse(response);178         return responseText;179     }180     //根据微群号和页号得到群成员ID信息 -----JSON格式数据181     public 
String getGroupMembers(String groupid,int pagenumber){182         this.headers.put("Referer", "http://q.weibo.com/"+groupid+"/members/all");183         this.headers.put("Host", "q.weibo.com");184         this.headers.put("Content-Type", "application/x-www-form-urlencoded");185         this.headers.put("x-requested-with", "XMLHttpRequest"); 187         Map<String,String> params=new HashMap<String,String>();188         params.put("_t", "0");189         params.put("page", pagenumber+"");190         params.put("gid", groupid);191         params.put("query","");192         params.put("tab", "all");193         params.put("vip", "1");194         String url="http://q.weibo.com/ajax/members/page";195         HttpResponse response=HttpUtils.doPost(url, headers, params);196         return HttpUtils.getStringFromResponse(response);197     }198     /*199      *  得到微群中微博信息 经过多次尝试成功200      *  每次获得50个微博记录,page是页号, count值50 可以在1-75之间,但是,每次开始的时候还是从50的倍数开始的201      */202     public String getGroupTopic(int page,int count,String gid){203         this.headers.put("Referer", "http://q.weibo.com/"+gid);204         this.headers.put("Host", "q.weibo.com");205         this.headers.put("Content-Type", "application/x-www-form-urlencoded");206         this.headers.put("x-requested-with", "XMLHttpRequest");207         Integer pre_page=1;208         if(page==1){209             pre_page=2;210         }else{211             pre_page=page-1;212         }213         Map<String,String> params=new HashMap<String,String>();214         params.put("_k", System.currentTimeMillis()+"");215         params.put("_t", "0");216         params.put("count", count+"");217         //params.put("end_id", end_id);218         params.put("gid", gid);219         params.put("is_search","");220         params.put("key_word", "");221         params.put("me", "0");222         params.put("mids", "");223         params.put("new", "0");224         params.put("page", page+"");225         params.put("pagebar", "0");  226         
params.put("pre_page", pre_page+"");227         params.put("since_id", "0");228         params.put("uid", "0");229         230         String url="http://q.weibo.com/ajax/mblog/groupfeed";231         HttpResponse response=HttpUtils.doPost(url, headers, params);232         return HttpUtils.getStringFromResponse(response);233     }234     /*235      *  得到微群中微博信息数目236      *  这个信息中其实还包含了微群的所有的基本信息~~~~~~~~~~****** json格式的数据信息237      */238     public String getGroupMessageNumber(String gid){239         this.headers.put("Referer", "http://q.weibo.com/"+gid);240         this.headers.put("Host", "q.weibo.com");241         this.headers.put("Content-Type", "application/x-www-form-urlencoded");242         this.headers.put("x-requested-with", "XMLHttpRequest");243         String url="http://q.weibo.com/ajax/rightnav/groupprofile?gid="+gid+"&_t=0&__rnd="+System.currentTimeMillis();244         HttpResponse response=HttpUtils.doGet(url, headers);245         return HttpUtils.getStringFromResponse(response);246     }247     //得到微群的主页信息  HTML页码   主要是为了得到第一条微博记录的MID值248     public String getgroupMainPage(String groupid) {249         String url="http://q.weibo.com/"+groupid+"?topnav=1";250         this.headers.remove("Referer");251         this.headers.put("Host", "q.weibo.com");252         this.headers.remove("Content-Type");253         this.headers.remove("x-requested-with");254         255         HttpResponse response=HttpUtils.doGet(url, headers);256         String responseText=HttpUtils.getStringFromResponse(response);257         return responseText;258     }259     /*260      * 根据分类得到微群信息261      * categroyID :分类ID号262      * pagenumber:页号263      * sort:分类方式 1 按成员人数 2按 微群博数 3按创建时间分类264      * count:每页的记录数目265      */266     public String getGroupByCategroy(int categroyID,int pagenumber,int sort,int count){267         this.headers.put("Referer", "http://q.weibo.com/class/category/?id="+categroyID);268         this.headers.put("Host", "q.weibo.com");269         
this.headers.put("Content-Type", "application/x-www-form-urlencoded");270         this.headers.put("x-requested-with", "XMLHttpRequest");271         Map<String,String> params=new HashMap<String,String>();272         params.put("_t", "0");273         params.put("page", pagenumber+"");274         params.put("id", categroyID+"");275         params.put("sort",sort+"");276         params.put("count", count+"");277         278         String url="http://q.weibo.com/ajax/class/category";279         HttpResponse response=HttpUtils.doPost(url, headers,params);280         String responseText=HttpUtils.getStringFromResponse(response);281         responseText=EncodeUtils.unicdoeToGB2312(responseText);282         return responseText;283     }284     //得到表情列表信息285     public String getFaceList(){286         String url="http://weibo.com/aj/mblog/face?type=face&_t=0&__rnd="+System.currentTimeMillis();287         this.headers.put("Referer", "http://weibo.com/");288         this.headers.put("Host", "weibo.com");289         this.headers.put("Content-Type", "application/x-www-form-urlencoded");290         this.headers.put("x-requested-with", "XMLHttpRequest");291         292         HttpResponse response=HttpUtils.doGet(url, headers);293         String responseText=HttpUtils.getStringFromResponse(response);294         System.out.println(responseText);295         Utils.writeFileFromString("tmpFile/faceList.txt", responseText);296         return responseText;297     }307     //用户基本信息          主要是将要解析用户主页下方经过编码后的内容308     public String getMemberInfo(String memberID){309         String url="http://weibo.com/"+memberID+"/info";310         this.headers.put("Host", "weibo.com");311         this.headers.put("Referer", "http://weibo.com/u/"+memberID);312         HttpResponse response=HttpUtils.doGet(url, headers);313         String responseText=HttpUtils.getStringFromResponse(response);314         return responseText;315     }316     //用户粉丝用户信息    html页面,每次20个317     public String 
getMemberFans(String memberID,int page){318         String url="http://weibo.com/"+memberID+"/fans?&uid=1689219395&tag=&page="+page;319         this.headers.put("Host", "weibo.com");320         this.headers.put("Referer", "http://weibo.com/"+memberID+"/fans");321         HttpResponse response=HttpUtils.doGet(url, headers);322         String responseText=HttpUtils.getStringFromResponse(response);323         return responseText;324     }325     //用户关注的用户信息     html页面326     public String getMemberFollowing(String memberID,int page){327         String url="http://weibo.com/"+memberID+"/follow?page="+page;328         this.headers.put("Host", "weibo.com");329         this.headers.put("Referer", "http://weibo.com/"+memberID+"/follow");330         HttpResponse response=HttpUtils.doGet(url, headers);331         String responseText=HttpUtils.getStringFromResponse(response);332         return responseText;333     }334     335     /*336      *  @params 337      *   memberID:是用户ID338      *   max_id:每次AJAX获得数据时上面一次的最后一个ID值339      *   end_id:用户最新的一条微博的ID值340      *   k:一个随机数341      *   page:页号342      *   pre_page:前一页343      *   count:每次返回的数值  当max_id为null是 count=50 否则为15344      *      pagebar:ajax时,第一次为0,第二次为1345      *   注意:346      *   1  用此请求,每次获得的数据格式都一样,用同样的解析方法来进行解析。347      *   2 每次一页可以获得总共45条记录,需要三次请求。每次请求可获得15条记录。348      *   3 max_id可以不用到,直接等于 end_id就可以了.349      *   4 第一次请求时可以将end_id设置为NUll,即为第一次时翻页时的请求后边的滚动时必须有end_id参数,end_id为第一页的第一条ID即可。350      */351     //获得用户发布的微博信息   json格式的数据    352     public String getMemberReleaseTopic(String memberID,String end_id,Integer page,Integer pagebar){353         String url="";354         Integer pre_page=1;355         Integer count=0;356         String k=System.currentTimeMillis()+""+(int)(Math.random()*100000)%100;357         if(end_id==null){358             count=50;359             if(page==1){360                 pre_page=2;361             }else{362                 pre_page=page-1;363             }364             
url="http://weibo.com/aj/mblog/mbloglist?" +365             "page="+page+"&count="+count+"&pre_page="+pre_page+"&" +366             "_k="+ k+"&uid="+memberID+367             "&_t=0&__rnd="+System.currentTimeMillis();368         }else{369             count=15;370             pre_page=page;371             url="http://weibo.com/aj/mblog/mbloglist?" +372             "page="+page+"&count="+count+"&max_id="+end_id+"&" +373             "pre_page="+pre_page+"&end_id="+end_id+"&" +374             "pagebar="+pagebar+"&_k="+k+"&" +375             "uid="+memberID+"&_t=0&__rnd="+System.currentTimeMillis();376         }377         String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=sm2014121903@126.com; myuid=5439352084; YF-Ugrow-G0=4703aa1c27ac0c4bab8fc0fc5968141e; SSOLoginState=1421374583; wvr=6; YF-V5-G0=8c4aa275e8793f05bfb8641c780e617b; _s_tentry=login.sina.com.cn; Apache=2461283528245.9854.1421374588453; ULV=1421374588550:13:5:3:2461283528245.9854.1421374588453:1421210767499; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; SUS=SID-2035860051-1421462085-GZ-7jcgb-1539d643bae5195fb7f792b2ae77befb; SUE=es%3Df15e11ed09b6a0108a28adfa58609b78%26ev%3Dv1%26es2%3Da0f706efac5c89495062648a4de3e337%26rs0%3DZBxlOUv0mhmxyHfOVmZ3tH7tNvAp08BjPeLUJPdu9WzG38Dsm40px%252Bd9w21ycDpZQwBK3q0prFfNs%252F8ZuZSasa1eps%252FOGNxJ3CIHN8JN%252Fik6gVpIPgVeeRdalNWTIbth6hLa34uOp%252BXii%252Bxeib%252BvINsr%252FdOvQx6kjp6fsC44QXc%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1421462085%26et%3D1421548485%26d%3Dc909%26i%3Dbefb%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D2%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A255vboVDeTxGeRO6FcZ9i7Mzj2IHXVazdpdrDV8PUNbvtBuLVj-kW91jmbQSGo7Rn30RVvGP5KOgBgNgQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; ALF=1452998078; ULOGIN_IMG=14214638933178; YF-Page-G0=0acee381afd48776ab7a56bd67c2e7ac";378         headers.put("Cookie", 
cookieValue);379         this.headers.put("Referer", "http://weibo.com/u/"+memberID);380         this.headers.put("Host", "www.weibo.com");381         this.headers.put("Content-Type", "application/x-www-form-urlencoded");382         this.headers.put("x-requested-with", "XMLHttpRequest");383         url = "http://weibo.com/u/"+memberID;384         HttpResponse response=HttpUtils.doGet(url, headers);385         if(response==null){386             return "";387         }388         return HttpUtils.getStringFromResponse(response);389     }390     /*391      * ~~~~~~~~~~~~~~~~~~~~~获取用户的一些信息~~~end~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~392      */393     394     395     //**********************************************************************************396 397     /*398      *  名人堂与达人信息399      */400     public String getVerified(String url){401         this.headers.put("Host", "verified.weibo.com");402         this.headers.put("Referer", "http://plaza.weibo.com/?topnav=1&wvr=4");403         HttpResponse response=HttpUtils.doGet(url, headers);404         String responseText=HttpUtils.getStringFromResponse(response);405         return responseText;406     }407 408     public String getVerifiedMember(String path,Integer g_index){409         String url="http://verified.weibo.com/aj/getgrouplist?g_index="+g_index+410         "&path="+path+"&_t=0&__rnd="+System.currentTimeMillis();411         this.headers.put("Host", "verified.weibo.com");412         this.headers.put("Referer", path);413         this.headers.put("Content-Type", "application/x-www-form-urlencoded");414         this.headers.put("x-requested-with", "XMLHttpRequest");415         HttpResponse response=HttpUtils.doGet(url, headers);416         String responseText=HttpUtils.getStringFromResponse(response);417     418         return responseText;419     }420     421     public String setArea(Integer provinceID){422         this.headers.put("Referer", "http://club.weibo.com/list");423         this.headers.put("Host", 
"club.weibo.com");424         this.headers.put("Content-Type", "application/x-www-form-urlencoded");425         this.headers.put("x-requested-with", "XMLHttpRequest");426 427         Map<String,String> params=new HashMap<String,String>();428         429         params.put("_t", "0");430         params.put("city", "1000");431         params.put("prov", provinceID+"");432         433         String url="http://club.weibo.com/ajax_setArea.php";434         HttpResponse response=HttpUtils.doPost(url, headers, params);435         436         List<Cookie> cks=HttpUtils.getResponseCookies(response);437         List<Cookie> cookies=ls.getCookies();438         cookies.addAll(cks);439         String cookieValue=HttpUtils.setCookie2String(cookies);440         this.headers.put("Cookie", cookieValue);441         442         return HttpUtils.getStringFromResponse(response);443     }444     445     public String getDaRen(Integer page){446         String op="ltime"; 447         String url="http://club.weibo.com/list?sex=3&op="+op+"&page="+page+"&";448         Integer pre_page=(page<=1? 
2:page-1);449         this.headers.put("Host", "club.weibo.com");450         this.headers.put("Referer", "http://club.weibo.com/list?sex=3&op=ltime&page="+pre_page+"&");451         this.headers.remove("Content-Type");452         this.headers.remove("x-requested-with");453         454         HttpResponse response=HttpUtils.doGet(url, headers);455         if(response!= null){456             return HttpUtils.getStringFromResponse(response);457         }458         return "";459         460     }470     //发布一条文字微博471     public String releaseTopic(String content){472         this.headers.put("Referer", "http://weibo.com/");473         this.headers.put("Host", "weibo.com");474         this.headers.put("Content-Type", "application/x-www-form-urlencoded");475         this.headers.put("x-requested-with", "XMLHttpRequest");476         Map<String,String> params=new HashMap<String,String>();477         params.put("_t", "0");478         params.put("location", "home");479         params.put("module", "stissue");480         params.put("pic_id", "");481         params.put("text", content);482         String url="http://weibo.com/aj/mblog/add?__rnd="+System.currentTimeMillis();483         HttpResponse response=HttpUtils.doPost(url, headers, params);484         return HttpUtils.getStringFromResponse(response);485     }519     //得到自己关注的成员520     public String getSelfFollowIngs(){521         return "";522     }523     //得到自己的粉丝524     public String getSelfFollowers(){525         return "";526     }527     //得到自己加入的微群528     public String getSelfJoinedGroups(){529         return "";530     }531     //得到自己的标签532     public String getSelfTags(){533         return "";534     }535     //得到自己发布的微博536     public String getSelfReleaseTopics(){537         return "";538     }539     //得到自己主页的微博540     public String getSelfPageTopics(){541         return "";542     }543     //关注一个人544     public String addFollowing(String memberid){545         return 
addorcancleFollowing(memberid,this.ADDFOLLOWING);546     }547     //取消关注一个人548     public String cancelFollowing(String memberid){549         return addorcancleFollowing(memberid,this.CANCELFOLLOWING);550     }551     private String addorcancleFollowing(String memberid,int option){552         String url="";553         switch(option){554             case ADDFOLLOWING:555                 url="http://weibo.com/aj/f/followed?__rnd="+System.currentTimeMillis();556                 break;557             case CANCELFOLLOWING:558                 url="http://weibo.com/aj/f/unfollow?__rnd="+System.currentTimeMillis();559                 break;560         }561         562         Map<String,String> params=new HashMap<String,String>();563 564         this.headers.put("Referer", "http://weibo.com/");565         this.headers.put("Host", "weibo.com");566         this.headers.put("Content-Type", "application/x-www-form-urlencoded");567         this.headers.put("Referer", "http://weibo.com/");568         this.headers.put("x-requested-with", "XMLHttpRequest");569         570         params.put("_t", "0");571         params.put("f", "1");572         params.put("location", "profile");573         params.put("refer_flag", "");574         params.put("refer_sort", "profile");575         params.put("uid", memberid);576         577         HttpResponse response=HttpUtils.doPost(url, headers, params);578         return HttpUtils.getStringFromResponse(response);579     }584     /**585      * 得到的标签信息  调用一次10个 586      * @return587      */588     public String getTags(){589         String url="http://account.weibo.com/set/aj/tagsuggest?__rnd="+System.currentTimeMillis();    590         this.headers.put("Referer", "http://account.weibo.com/set/tag#");591         this.headers.put("Host", "account.weibo.com");592         HttpResponse response=HttpUtils.doGet(url, headers);593         return HttpUtils.getStringFromResponse(response);594     }595     596     /**597      * 得到微博热词信息598      * @param k 
:热词的门类599      */600     public String getHotWords(String k){601         String url="http://data.weibo.com/top/keyword?k="+k;    602         try{603             Integer.parseInt(k);604         }catch(Exception ex){605             url="http://data.weibo.com/top/keyword?t="+k;    606         }607         this.headers.put("Referer", "http://data.weibo.com/top/keyword");608         this.headers.put("Host", "data.weibo.com");609         HttpResponse response=HttpUtils.doGet(url, headers);610         return HttpUtils.getStringFromResponse(response);611     }612     613     /**614      * 得到微博热帖子615      * @param cat  表示热帖门类616      * @param page 表示页号617      */618     public String getHotWeibo(String cat,int page){619         String url="http://data.weibo.com/hot/ajax/catfeed?page="+page+"&cat="+cat+"&_t=0&__rnd="+System.currentTimeMillis();    620         this.headers.put("Referer", "http://data.weibo.com/hot/minibloghot");621         this.headers.put("Host", "data.weibo.com");622         HttpResponse response=HttpUtils.doGet(url, headers);623         return HttpUtils.getStringFromResponse(response);624     }625     626     /**627      * 按照分类获取 微博吧名字  第一步628      */629     public String getWeiBar(String ctgid,int p){630         String sort="post";631         String url="http://weiba.weibo.com/aj_f/CategoryList?sort="+sort+"&p="+p+"&ctgid="+ctgid+"&_t=0&__rnd="+System.currentTimeMillis();    632         this.headers.put("Referer", "http://weiba.weibo.com/ct/"+ctgid);633         this.headers.put("Host", "weiba.weibo.com");634         this.headers.put("Accept", "*/*");635         this.headers.put("Content-Type", "application/x-www-form-urlencoded");636         this.headers.put("X-Requested-With", "XMLHttpRequest");637         HttpResponse response=HttpUtils.doGet(url, headers);638         return HttpUtils.getStringFromResponse(response);639     }640     /**641      * 根据微博吧 名称 ,得到该吧内的所有帖子标题 第二步642      */643     public String getWeiBarByWeibarName(String bid,int p){644       
  String url="http://weiba.weibo.com/aj_t/postlist?bid="+bid+"&p="+p+"&_t=all&__rnd="+System.currentTimeMillis();    645         this.headers.put("Referer", "http://weiba.weibo.com/");646         this.headers.put("Host", "weiba.weibo.com");647         this.headers.put("Accept", "*/*");648         this.headers.put("Content-Type", "application/x-www-form-urlencoded");649         this.headers.put("X-Requested-With", "XMLHttpRequest");650         HttpResponse response=HttpUtils.doGet(url, headers);651         return HttpUtils.getStringFromResponse(response);652     }653     654     /**655      * 新浪微公益名单656      * type ="donate"657      * type="discuss"658      */659     public String getWeiGongYiMember(int page,int projectID,String type){660         String url="http://gongyi.weibo.com/aj_personal_helpdata?page="+page+"&type="+type+"&project_id="+projectID+"&_t=0&__rnd="+System.currentTimeMillis();    661         this.headers.put("Referer", "http://gongyi.weibo.com/"+projectID);662         this.headers.put("Host", "gongyi.weibo.com");663         this.headers.put("Accept", "*/*");664         this.headers.put("Content-Type", "application/x-www-form-urlencoded");665         this.headers.put("X-Requested-With", "XMLHttpRequest");666         HttpResponse response=HttpUtils.doGet(url, headers);667         return HttpUtils.getStringFromResponse(response);668     }669 }

 

1楼ljy09
引用的包能发一下吗?很有用,正想学爬虫就看到这个了,谢谢啦!
Re: RunforLove
@ljy09,把QQ邮箱给我,我发到你的邮箱。
  相关解决方案