获取百度新闻中所有中国新闻的标题、时间和来源
1 获取网页 2 public static String getContent(String str) throws ClientProtocolException, IOException { 3 CloseableHttpClient closeableHttpClient=HttpClients.createDefault(); //创建实例 4 HttpGet httpGet=new HttpGet(str); 5 CloseableHttpResponse closeableHttpResponse=closeableHttpClient.execute(httpGet); //执行--返回 6 HttpEntity httpEntity=closeableHttpResponse.getEntity(); //获取实体 7 String content=EntityUtils.toString(httpEntity, "utf-8"); 8 closeableHttpResponse.close(); 9 closeableHttpClient.close();10 return content;11 }12 ======= ====== ======= ========13 筛选所有符合要求的链接14 public static ArrayListgetUrl(String str,String strr) {15 Document doc=Jsoup.parse(str);16 Elements elements =doc.select("a[href]"); //获取a标签17 ArrayList strs=new ArrayList ();18 for(Element e:elements) {19 String urls=e.attr("abs:href");20 if(urls.startsWith(strr)) {21 strs.add(urls);22 }23 }24 return strs;25 }26
测试解析
public class BaiduDemo { public static void main(String[] args) { // TODO Auto-generated method stub String str="http://news.baidu.com"; try { String content=GetUtil.getContent(str); ArrayListlist=GetUtil.getUrl(content, "https://kandian.youth.cn/"); for(String s:list) { System.out.println(s); String strr=GetUtil.getContent(s); Document doc=Jsoup.parse(strr); Elements ele1=doc.select("div[class=J-title_detail title_detail] h1"); Elements ele=doc.select("div[class=J-title_detail title_detail]"); if(ele!=null) { System.out.println("标题: "+ele1.text()); Elements eles=ele.select("div[class=fl] i"); System.out.println("发帖时间: "+eles.text()); Elements eless=ele.select("div[class=fl] a"); System.out.println("发帖来源: "+eless.text()); } } } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }}
另一方式获取
public static void main(String[] args) { // TODO Auto-generated method stub try { String str=GetUtil.getContent("http://sports.163.com/18/0207/09/DA1HPMLI00058781.html"); //System.out.println(str); Document doc=Jsoup.parse(str); Element element=doc.getElementById("epContentLeft"); Elements elements=element.getElementsByTag("h1"); System.out.println("标题: "+elements.text()); Elements elementss=doc.getElementsByClass("post_time_source"); System.out.println("发帖时间: "+elementss.text().substring(0,19)); element=doc.getElementById("endText"); System.out.println("正文:"); System.out.println(element.text()); elementss=doc.getElementsByClass("ep-source cDGray"); System.out.println(elementss.text()); //抓取评论 elementss=doc.getElementsByClass("tie-cnt"); //tie-cnt System.out.println("跟帖 :"+elementss.text()); } catch (ClientProtocolException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }}