导航菜单
首页 >  Jsoup解析HTML 标签内容  > jsoup爬虫发送get、post请求、解析html、获取json

jsoup爬虫发送get、post请求、解析html、获取json

@[TOC]

1 简介

jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址、HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。

依赖

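```xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
```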

官网地址:https://jsoup.org/ (Element API 文档:https://jsoup.org/apidocs/org/jsoup/nodes/Element.html)

# 2 发送get请求

## 2.1 设置请求头和cookie

```java
/**
 * Demo entry point: requests the Baidu home page using a sample cookie string.
 *
 * @param args unused
 * @throws IOException if the request fails
 */
public static void main(String[] args) throws IOException {
    String url = "https://www.baidu.com/";
    String cookie = "BIDUPSID=56C4FFBF72723876ACBD5B2EA2C75AD8; PSTM=1669362017; BD_UPN=12314753; BAIDUID=F4A7FCF34D70EE033317D31778290FE0:FG=1; ZFY=bMkQzdDwo03XIz75dDC8:AwpdCLhCOaBeWmYRxguNzHQ:C; BAIDUID_BFESS=F4A7FCF34D70EE033317D31778290FE0:FG=1; baikeVisitId=a61698de-af11-451a-8f37-7a0efc41ef7c; __bid_n=184d169de2c1b652e44207; RT=\"z=1&dm=baidu.com&si=7ao2cg4efas&ss=lb63h3q0&sl=3&tt=2hm&bcn=https://fclog.baidu.com/log/weirwood?type=perf&ld=31p&ul=18ia&hd=18ka\"; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; COOKIE_SESSION=439025_2_8_9_12_10_0_0_8_6_0_0_459518_56535_3_0_1670314644_1669853798_1670314641|9#56556_3_1669853666|2; BA_HECTOR=24a484ah0k84258h0h852gj41hovnrf1g; ab_sr=1.0.1_ODA0NTY3OGYzMmViNzMyNGQ1YWU1MzhjZDc1YjM2ZjQ0MjlkZjlhOTI2ZjZjMjk1N2NlODAzNjg5YjRlNDQxNjUzZjQ2M2JkZDYwOWQ2Y2Q1MDI4NzRhNjRkM2RjYmQxOGNiNjZhMTk0YjFhZThmMmE4ZjljN2MyMzJmOWFhOWNlMTk3MmZjNTcyNjNkN2RhYjQ3M2Y1MjNlMjViNjcyOQ==; BDRCVFR[PGnakqNNAQT]=mk3SLVN4HKm; BD_HOME=1; H_PS_PSSID=37856_36560_36920_37835_37841_37871_37765_37797_37760_37853_26350_22160_37881; delPer=0; BD_CK_SAM=1; PSINO=1; H_PS_645EC=35d9T+b4lE+jHT8VNaqK1aghA5CXp4Yf4fqDSH/hRVMSETsPPBQsOh069iA";
    doGet(url, cookie);
}

/**
 * Sends a GET request with browser-like headers and prints the title of the returned page.
 *
 * @param url    address to request
 * @param cookie raw value for the {@code Cookie} request header
 * @throws IOException if the request fails
 */
public static void doGet(String url, String cookie) throws IOException {
    // Open a connection to the target URL
    Connection con = Jsoup.connect(url);

    // Request headers; the Host and Referer values are the ones used for the Baidu sample,
    // adjust them when requesting another site
    con.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
    con.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36");
    con.header("Host", "www.baidu.com");
    con.header("Referer", "https://www.baidu.com/link?url=Su25Siv6Fi0gZJgDQoDcxse30NxOOLSSqQcNy0k6FU3&wd=&eqid=d8ed532b000af73400000003638feddc");
    con.header("Cookie", cookie);

    // Execute the request and parse the response
    Document doc = con.get();

    // Print the page title
    System.out.println(doc.title());

    // Print the whole page
    // System.out.println(doc.toString());
}
```

## 2.2 设置请求参数

```java
// Open a connection to the target URL
Connection conn = Jsoup.connect("http://www.cnblogs.com/zhangfei/p/");
// Add a request parameter
conn.data("page", "3");
// Execute the GET request and parse the response
Document doc = conn.get();
```

## 2.3 获取登录后的cookie、response

```java
/**
 * Sends a GET request and prints the cookies and headers of the response.
 *
 * @param url    address to request
 * @param cook   raw value for the {@code Cookie} request header
 * @param header name of the response header whose value is returned
 * @return the value of the response header named {@code header}
 * @throws IOException if the request fails
 */
public static String doGetResponse(String url, String cook, String header) throws IOException {
    // Open a connection to the target URL
    Connection con = Jsoup.connect(url);

    // Request headers, including the cookie
    con.header("Accept", "text/html, application/xhtml+xml, */*");
    con.header("Content-Type", "application/x-www-form-urlencoded");
    con.header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0))");
    con.header("Cookie", cook);

    // Send the request, keeping the raw response instead of a parsed Document
    Connection.Response resp = con.method(Connection.Method.GET).execute();

    // Value of the cookie named "__bsi"
    String cookieValue = resp.cookie("__bsi");
    System.out.println("cookie __bsi值: " + cookieValue);

    // All cookies of the response
    Map<String, String> cookies = resp.cookies();
    System.out.println("所有cookie值: " + cookies);

    // Value of the requested response header
    String headerValue = resp.header(header);
    System.out.println("头文件" + header + "的值:" + headerValue);

    // All response headers
    Map<String, String> headersOne = resp.headers();
    System.out.println("所有头文件值:" + headersOne);

    return headerValue;
}
```

# 3 发送post请求

## 3.1 发送post请求返回body

```java
/**
 * Sends a form POST request and returns the parsed response document as a string.
 *
 * @param url    address to request
 * @param map    form parameters to send; may be {@code null} for none
 * @param cookie raw value for the {@code Cookie} request header
 * @return the response document rendered as a string
 * @throws IOException if the request fails
 */
public static String doPost(String url, Map<String, String> map, String cookie) throws IOException {
    // Open a connection to the target URL
    Connection con = Jsoup.connect(url);

    // Add every entry of the map as a form parameter
    if (map != null) {
        for (Map.Entry<String, String> entry : map.entrySet()) {
            con.data(entry.getKey(), entry.getValue());
        }
    }

    // Send the cookie as a request header
    con.header("Cookie", cookie);

    Document doc = con.post();
    System.out.println(doc);
    return doc.toString();
}
```

## 3.2 发送post请求获取cookie、response

```java
// Send the request, keeping the raw response instead of a parsed Document
Connection.Response resp = con.method(Connection.Method.POST).execute();
// Value of the cookie whose name is held in `header`
String cookieValue = resp.cookie(header);
System.out.println(cookieValue);
```

## 3.3 发送json参数(application/json)

```java
// Open a connection to the target URL
Connection con = Jsoup.connect(url);

// Build the JSON payload (Alibaba fastjson2 is used here)
JSONObject jsonObject = new JSONObject();
jsonObject.put("name", "迪迦");

// Send the JSON as the raw request body and declare its type explicitly
con.header("Content-Type", "application/json");
con.requestBody(jsonObject.toString());

// ignoreContentType(true) concerns the RESPONSE: without it jsoup refuses to handle
// a response whose content type it does not recognise, such as application/json
Connection.Response response =
        con.ignoreContentType(true).method(Connection.Method.POST).execute();

// Raw response body, i.e. the returned JSON text
String body = response.body();
```

# 4 Document的使用

## 4.1 使用DOM的方式来取得

- `getElementById(String id)`:通过id来获取
- `getElementsByTag(String tagName)`:通过标签名字来获取
- `getElementsByClass(String className)`:通过类名来获取
- `getElementsByAttribute(String key)`:通过属性名字来获取
- `getElementsByAttributeValue(String key, String value)`:通过指定的属性名字,属性值来获取
- `getAllElements()`:获取所有元素

## 4.2 通过类似于css或jQuery的选择器来查找元素

```java
public Elements select(String cssQuery)

//举例
Elements links = doc.select("a[href]"); //带有href属性的a元素
Elements pngs = doc.select("img[src$=.png]");
```

## 4.3 从Element对象中,取出我们真正需要的内容

```java
Element.text()                     //这个方法用来取得一个元素中的文本。
Element.html()或Node.outerHtml()   //这个方法用来取得一个元素中的html内容
Node.attr(String key)              //获得一个属性的值,例如取得超
```

相关推荐: