import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a web page and extracts the targets of its anchor tags
 * ({@code <a href="...">}) as absolute URLs.
 */
public class GetLinks {

    /** Opening of an anchor tag; group 1 is the attribute text up to (not including) {@code '>'}. */
    private static final Pattern ANCHOR_TAG =
            Pattern.compile("<a\\s([^>]*)", Pattern.CASE_INSENSITIVE);

    /**
     * An {@code href} attribute inside an anchor tag. Exactly one group is non-null on a match:
     * group 1 = double-quoted value, group 2 = single-quoted value, group 3 = unquoted value.
     * A missing closing quote is tolerated. The look-behind rejects names such as
     * {@code data-href}.
     */
    private static final Pattern HREF_ATTRIBUTE = Pattern.compile(
            "(?<![\\w-])href\\s*=\\s*(?:\"([^\"]*)\"?|'([^']*)'?|(\\S*))",
            Pattern.CASE_INSENSITIVE);

    /** A URI scheme prefix such as {@code http:} or {@code mailto:} (RFC 3986 syntax). */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("[a-zA-Z][a-zA-Z0-9+.\\-]*:");

    /** HTML of the page; {@code null} only if the caller supplied {@code null}. */
    private final String webSource;

    /** Address of the page; relative links are resolved against it. */
    private final String url;

    /**
     * Downloads the page at {@code url} and keeps its source for link extraction.
     *
     * @param url address of the page to download
     * @throws MalformedURLException if {@code url} is not a valid URL
     * @throws IOException if the page cannot be read
     */
    public GetLinks(String url) throws MalformedURLException, IOException {
        this.url = complete(url);
        this.webSource = getWebCon(this.url);
    }

    /**
     * Uses HTML that has already been obtained, so no network access takes place.
     *
     * @param url address the HTML came from; relative links are resolved against it
     * @param webSource the HTML text to scan, may be {@code null}
     * @throws MalformedURLException if {@code url} is not a valid URL
     */
    public GetLinks(String url, String webSource) throws MalformedURLException {
        new URL(url); // fail early instead of silently mis-resolving every link later
        this.url = url;
        this.webSource = webSource;
    }

    /**
     * Reads the whole resource at {@code strURL} as text, using the platform default charset.
     *
     * @param strURL address to read
     * @return the content, with every line terminated by {@code '\n'}
     * @throws MalformedURLException if {@code strURL} is not a valid URL
     * @throws IOException if the resource cannot be opened or read
     */
    private static String getWebCon(String strURL) throws MalformedURLException, IOException {
        StringBuilder page = new StringBuilder();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(strURL).openStream()));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                // Keep a separator between lines: gluing them together would turn a tag
                // split across lines ("<a\nhref=...") into "<ahref=...".
                page.append(line).append('\n');
            }
        } finally {
            in.close(); // release the connection even when reading fails
        }
        return page.toString();
    }

    /**
     * Decides whether {@code link} should carry a trailing slash. If {@code link} and
     * {@code link + "/"} serve identical content, the slash form is returned so that
     * relative links resolve against it as a directory.
     *
     * @param link address to probe
     * @return {@code link}, or {@code link + "/"} when both serve the same content
     * @throws MalformedURLException if {@code link} is not a valid URL
     */
    private static String complete(String link) throws MalformedURLException {
        new URL(link); // validation only; a malformed address must still be reported
        if (link.endsWith("/")) {
            return link; // already in directory form; appending another slash would yield "//"
        }
        try {
            if (getWebCon(link).equals(getWebCon(link + "/"))) {
                return link + "/";
            }
        } catch (Exception ignored) {
            // Deliberately broad: this is a best-effort probe. If either fetch fails the
            // address is used as given, and a real failure surfaces when the page is loaded.
        }
        return link;
    }

    /**
     * Resolves a link found in the page against the page address.
     *
     * @param link relative or absolute link, may be {@code null}
     * @return the absolute URL; {@code link} trimmed but otherwise unchanged when it already
     *         has a scheme or cannot be resolved; {@code null} if {@code link} is {@code null}
     */
    private String urlHandler(String link) {
        if (link == null) {
            return null;
        }
        String target = link.trim();
        if (SCHEME_PREFIX.matcher(target).lookingAt()) {
            // Already absolute (http:, https:, mailto:, javascript:, ...): nothing to resolve.
            return target;
        }
        try {
            // Standard resolution handles "x", "/x", "../x", "?q", "#f" and "//host/x".
            return new URL(new URL(url.trim()), target).toString();
        } catch (MalformedURLException unresolvable) {
            return target;
        }
    }

    /**
     * Extracts the {@code href} target of every anchor tag in the page, in document order.
     * Each link is also echoed to standard output as it is found.
     *
     * @return the absolute link targets; an empty list when there is no page source or the
     *         page contains no anchors with an {@code href}
     */
    public List<String> getAnchorTagUrls() {
        List<String> links = new ArrayList<String>();
        if (webSource == null) {
            System.out.println("没有网页源代码");
            return links;
        }
        Matcher tag = ANCHOR_TAG.matcher(webSource);
        while (tag.find()) {
            Matcher href = HREF_ATTRIBUTE.matcher(tag.group(1));
            if (!href.find()) {
                continue; // anchor without href, e.g. <a name="top">
            }
            // The value is taken from directly after "href=", so a quote that belongs to a
            // later attribute can never be mistaken for the href delimiter.
            String raw = href.group(1);
            if (raw == null) {
                raw = href.group(2);
            }
            if (raw == null) {
                raw = href.group(3);
            }
            String absolute = urlHandler(raw);
            links.add(absolute);
            System.out.println(absolute);
        }
        return links;
    }

    /**
     * Demo: prints every link found on the Baidu home page.
     *
     * @param args unused
     * @throws Exception if the page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        GetLinks gl = new GetLinks("http://www.baidu.com");
        List<String> list = gl.getAnchorTagUrls();
        for (String str : list) {
            System.out.println(str);
        }
    }
}
什么叫“自动获取”？如果是想用 Java 访问 HTTP 连接的话，步骤如下：
1.创建连接:
URL url = new URL("http://www.baidu.com");
2.打开连接,获取连接输入流。
InputStream in = url.openConnection().getInputStream();
3.解析流。
System.out.println(IOUtils.toString(in)); // 输出访问地址的内容（IOUtils 是第三方工具类，例如 Apache Commons IO 中的 IOUtils）
方法1：用正则，从 (http://) 或者 (https://) 开头往后匹配到第三个点（不会写的话可以百度一下），然后把最后的点去掉，就可以得到域名。
方法2：将 URL 字符串转换为 char 数组并遍历，对“.”（点）出现的次数进行计数，数到第三个时记下当前下标，再用 substring 截取字符串即可得到域名。