Web Crawler
I want to develop an offline browser.But my eyesight is kind of blurred and am unable to decide which path to follow.So I decided to go step by step ,starting with the simplest code I can write and eventually I will reach my destination.
So first of all ,I am going to write a code that can extract all the hyperlinks present in a html file.For that I need to input an URL from which I will get the html code , detect all the anchors present in the code ,extract the links and display them as output.Finally I come up with the following code:-
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
*
* @author kiit
*/
public class Main {
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// TODO code application logic here
try {
URL my_url = new URL("http://ruchipurohit.blogspot.com/");
BufferedReader br = new BufferedReader(new InputStreamReader(my_url.openStream()));
String strTemp = "";
String finalstr="";
while(null != (strTemp = br.readLine())){
finalstr=finalstr+strTemp;
// System.out.println(strTemp);
}
{ try
{
java.io.Reader reader =
new java.io.StringReader(finalstr);
final javax.swing.text.html.parser.ParserDelegator parserDelegator =
new javax.swing.text.html.parser.ParserDelegator();
final javax.swing.text.html.HTMLEditorKit.ParserCallback
parserCallback =
new javax.swing.text.html.HTMLEditorKit.ParserCallback()
{ public void handleText( final char[] data, final int pos ){}
public void handleStartTag
( final javax.swing.text.html.HTML.Tag tag,
final javax.swing.text.MutableAttributeSet attribute,
final int pos )
{ if( tag == javax.swing.text.html.HTML.Tag.A )
{ final java.lang.String address =( java.lang.String )
attribute.getAttribute
( javax.swing.text.html.HTML.Attribute.HREF );
java.lang.System.out.println( address ); }}
public void handleEndTag
( final javax.swing.text.html.HTML.Tag t, final int pos ){}
public void handleSimplTag
( final javax.swing.text.html.HTML.Tag t,
final javax.swing.text.MutableAttributeSet a, final int pos ){}
public void handleComment
( final char[] data, final int pos ){}
public void handleError
( final java.lang.String errMsg, final int pos ){} };
parserDelegator.parse( reader, parserCallback, false );
java.lang.System.out.println(); }
catch( final java.io.IOException iOException )
{ java.lang.System.err.println( iOException ); }}
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
But problem with this code is it doesn't work for some URLs.For some URLs it's throwing an exception---
javax.swing.text.ChangedCharSetException
I am new to this face of java and trying to learn more and more about it.I'll be glad if someone help me with it.
I m a great idiot.To ignore the characterset I just have to change the third parameter of parse( ) to True.The signature of the parse( ) method is as follows-
public void parse(Reader in,
HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet);
Now I will try to separate links to image file from the rest of the links and i will try to make html files for all the URLs retrieved.I will update my code as soon as i'll get it done.I am facing problem in copying image from a specified URL to a file.Following is my code:-
import java.awt.*;
import java.io.*;
import java.net.URL;
import java.awt.event.*;
import java.awt.image.BufferedImage;
import javax.imageio.ImageIO;
public class HtmlFile {
public static void main(String[] args) {
// TODO code application logic here
try {
String urlString= "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjiKL2zJyUzfFDOZhsWsbAXYYiU4CI8HQRZNDa94KgwhHg6Jg7VGqb0N2ueVo4pZDmLIio_gEnS7VIcGofASVjXX0cICyAbitcOft5ikLlAHXJndkGU4cJQ7JGhMLubxZbJtN-z10QhsAj/s1600/Screenshot-Test+OS-3.png";
//"http://forums.sun.com/thread.jspa?threadID=5390245"; //"https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjiKL2zJyUzfFDOZhsWsbAXYYiU4CI8HQRZNDa94KgwhHg6Jg7VGqb0N2ueVo4pZDmLIio_gEnS7VIcGofASVjXX0cICyAbitcOft5ikLlAHXJndkGU4cJQ7JGhMLubxZbJtN-z10QhsAj/s1600/Screenshot-Test+OS-3.png";
//https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgP2cGSEb4owqf8e6fK_Yq1t9vQgknSAWqFaWY2bauiYWI7Btz8uvb3mARp1aHEXMtDuAyLSlhf76IASw9D2EeCvkGiUD8QRAu5hKuv1xpO0hywuor3DQU4gLwjeVdOzLR4C0f7wXexFX08/s1600-h/1.JPG");
URL my_url = new URL(urlString);//"http://ruchipurohit.blogspot.com/");//"http://www.vimalkumarpatel.blogspot.com/");
int length=urlString.length();
System.out.println(urlString.substring(length-3));
if(urlString.substring(length-3).compareTo("jpg")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.jpg");
ImageIO.write(img, "jpg", outputfile);
}
else if (urlString.substring(length-3).compareTo("png")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.png");
ImageIO.write(img, "png", outputfile);
}
else if(urlString.substring(length-3).compareTo("gif")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.gif");
ImageIO.write(img, "gif", outputfile);
}
else
{
BufferedReader br = new BufferedReader(new InputStreamReader(my_url.openStream()));
String strTemp = "";
String finalstr="";
while(null != (strTemp = br.readLine())){
finalstr=finalstr+strTemp;
// System.out.println(strTemp);
}
File f=new File("browser.html");
FileWriter fw=new FileWriter(f);
fw.write(finalstr);
}
}catch (Exception ex) {
ex.printStackTrace();
}
}
}
Above code is loading webpages from a specified URL on my system folder but without images so i need to copy all the images present in the webpage to my folder.But the code which i have written is unable to extract image from a URL.Its giving following error-
java.lang.IllegalArgumentException: Width (-1) and height (-1) cannot be <= 0
at java.awt.image.DirectColorModel.createCompatibleWritableRaster(DirectColorModel.java:999)
at java.awt.image.BufferedImage.(BufferedImage.java:314)
at crawler.HtmlFile.main(HtmlFile.java:50)
When I explicitly passed width and height to the constructor of BufferedImage class then its running without any error and only creating a image file without displaying any image in it. I am searching for the solution in net.If anyone can help me ,I'll b glad.
import java.awt.*;
import java.io.*;
import java.net.URL;
import java.awt.event.*;
import java.awt.image.BufferedImage;
import javax.imageio.ImageIO;
public class HtmlFile {
public static void main(String[] args) {
// TODO code application logic here
try {
String urlString= "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjiKL2zJyUzfFDOZhsWsbAXYYiU4CI8HQRZNDa94KgwhHg6Jg7VGqb0N2ueVo4pZDmLIio_gEnS7VIcGofASVjXX0cICyAbitcOft5ikLlAHXJndkGU4cJQ7JGhMLubxZbJtN-z10QhsAj/s1600/Screenshot-Test+OS-3.png";
//"http://forums.sun.com/thread.jspa?threadID=5390245"; //"https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjjiKL2zJyUzfFDOZhsWsbAXYYiU4CI8HQRZNDa94KgwhHg6Jg7VGqb0N2ueVo4pZDmLIio_gEnS7VIcGofASVjXX0cICyAbitcOft5ikLlAHXJndkGU4cJQ7JGhMLubxZbJtN-z10QhsAj/s1600/Screenshot-Test+OS-3.png";
//https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgP2cGSEb4owqf8e6fK_Yq1t9vQgknSAWqFaWY2bauiYWI7Btz8uvb3mARp1aHEXMtDuAyLSlhf76IASw9D2EeCvkGiUD8QRAu5hKuv1xpO0hywuor3DQU4gLwjeVdOzLR4C0f7wXexFX08/s1600-h/1.JPG");
URL my_url = new URL(urlString);//"http://ruchipurohit.blogspot.com/");//"http://www.vimalkumarpatel.blogspot.com/");
int length=urlString.length();
System.out.println(urlString.substring(length-3));
if(urlString.substring(length-3).compareTo("jpg")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.jpg");
ImageIO.write(img, "jpg", outputfile);
}
else if (urlString.substring(length-3).compareTo("png")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.png");
ImageIO.write(img, "png", outputfile);
}
else if(urlString.substring(length-3).compareTo("gif")==0){
Image image = Toolkit.getDefaultToolkit().getImage(my_url);
BufferedImage img = new BufferedImage(image.getWidth(null),image.getHeight(null),BufferedImage.TYPE_INT_RGB);
Graphics2D g2 = img.createGraphics();
// Draw img into bi so we can write it to file.
g2.drawImage(image, 0, 0, null);
g2.dispose();
File outputfile = new File("saved.gif");
ImageIO.write(img, "gif", outputfile);
}
else
{
BufferedReader br = new BufferedReader(new InputStreamReader(my_url.openStream()));
String strTemp = "";
String finalstr="";
while(null != (strTemp = br.readLine())){
finalstr=finalstr+strTemp;
// System.out.println(strTemp);
}
File f=new File("browser.html");
FileWriter fw=new FileWriter(f);
fw.write(finalstr);
}
}catch (Exception ex) {
ex.printStackTrace();
}
}
}
Above code is loading webpages from a specified URL on my system folder but without images so i need to copy all the images present in the webpage to my folder.But the code which i have written is unable to extract image from a URL.Its giving following error-
java.lang.IllegalArgumentException: Width (-1) and height (-1) cannot be <= 0
at java.awt.image.DirectColorModel.createCompatibleWritableRaster(DirectColorModel.java:999)
at java.awt.image.BufferedImage.
