public class FetchElement implements Comparable<FetchElement>{
private String content;
private String href;
public double pr;
public FetchElement(){
super();
}
//get,set方法省略
public FetchElement(String content,String href){
this.content=content;
this.href=href;
this.pr=0.0;
}
@Override
public boolean equals(Object obj) {
if(!(obj instanceof FetchElement))
return false;
FetchElement fe2=(FetchElement)obj;
if(this.getHref().hashCode()==fe2.getHref().hashCode()){
return true;
}
int len=this.href.length()-fe2.href.length();
boolean b=this.getHref().contains(fe2.getHref())?true:fe2.getHref().contains(this.getHref());
if(b&&Math.abs(len)<=1)
return true;
if(this.getContent().hashCode()==fe2.getContent().hashCode())
return true;
return false;
}
@Override
public int compareTo(FetchElement fe2) {
if(this.getHref().hashCode()==fe2.getHref().hashCode()){
return 0;
}
int len=this.href.length()-fe2.href.length();
boolean b=this.getHref().contains(fe2.getHref())?true:fe2.getHref().contains(this.getHref());
if(b&&Math.abs(len)<=1)
return 0;
if(this.getContent().hashCode()==fe2.getContent().hashCode())
return 0;
if(this.getPr()>fe2.getPr())
return -1;
if(this.getPr()<fe2.getPr())
return 1;
return 1;
}
}
/**
 * Collects the links found on a page into a shared set of
 * {@link FetchElement}s.
 */
public class PageRank {
    /** Links collected so far; the set decides uniqueness via FetchElement.compareTo. */
    private static TreeSet<FetchElement> srcset = new TreeSet<FetchElement>();

    /**
     * Downloads {@code url}, walks every {@code a[href]} element and stores
     * each accepted link that is not already in the set, printing the URL of
     * every newly stored link.
     *
     * <p>Any exception raised while fetching or parsing is printed and
     * otherwise ignored, so a failing page does not abort the caller.
     *
     * @param url the page to fetch
     */
    public static void initSet(String url) {
        try {
            // NOTE(review): 3000 is presumably a timeout in milliseconds - confirm against the Jsoup version in use.
            Document doc = Jsoup.connect(url).timeout(3000).ignoreContentType(true).get();
            Elements links = doc.select("a[href]");
            if (links == null) {
                return;
            }
            for (Element link : links) {
                String href = dealUrl(link.attr("abs:href"));
                // obeyFetchRule is defined outside this excerpt; links it rejects are skipped.
                if (!obeyFetchRule(href)) {
                    continue;
                }
                FetchElement fe = new FetchElement(link.text(), href);
                // add() already reports whether the element was new, so a
                // separate contains() call would only search the tree twice.
                if (srcset.add(fe)) {
                    System.out.println(href);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes the fragment part of a URL, i.e. everything from the first
     * {@code '#'} onwards.
     *
     * @param url the URL to clean; may be {@code null}
     * @return the URL without its fragment, the unchanged URL when it has no
     *         {@code '#'}, or {@code null} when {@code url} is {@code null}
     */
    public static String dealUrl(String url) {
        if (url == null) {
            return null;
        }
        int limit = url.indexOf('#');
        // >= 0 rather than > 0: a '#' in first position is still a fragment marker.
        return limit >= 0 ? url.substring(0, limit) : url;
    }
}
按理说,TreeSet中的元素不应该重复,但是我把FetchElement中的href打印出来后,发现有很多重复数据:
http://www.pc841.com
http://www.pc841.com/shouji/
http://www.pc841.com/zuzhuang/
http://www.pc841.com/notebook/
http://tu.pc841.com
http://www.pc841.com/Window7/
http://www.pc841.com/weixiu/
http://www.pc841.com/zhishi/
http://www.pc841.com/wenda/
http://www.pc841.com/jiqiao/
http://www.pc841.com
http://www.pc841.com/notebook/
http://www.pc841.com/pad/
http://www.pc841.com/youxi/
http://www.pc841.com/luyouqi/
http://www.pc841.com/xinwen/
http://www.pc841.com/shouji/
为什么呢?真心不懂为什么
------解决方案--------------------
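在compareTo方法上打断点调试一下。TreeSet只根据compareTo是否返回0来判断元素是否重复;而这里所有元素的pr初始值都是0.0,pr相等时两个方向的比较都返回1,不满足compareTo要求的对称性,所以查找时可能走不到树中已有的相同元素,重复数据就被插入了。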