您的位置:首页 > 编程语言 > Java开发

JAVA写的网页爬虫爬取必应词典的词汇音标

2016-07-21 15:41 691 查看
为了建立一个英语词汇音标库,需要从网上搜索词汇音标,由于有10万多个词汇需要处理,所以做个这个单词音标爬虫爬取必应网络词典网页上的单词音标。

package bingword;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javafx.application.Application;
import javafx.beans.value.ChangeListener;
import javafx.beans.value.ObservableValue;
import javafx.concurrent.Worker;
import javafx.event.ActionEvent;
import javafx.event.EventHandler;
import javafx.geometry.Insets;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextArea;
import javafx.scene.control.TextField;
import javafx.scene.control.Tooltip;
import javafx.scene.layout.GridPane;
import javafx.scene.text.Font;
import javafx.scene.web.WebEngine;
import javafx.scene.web.WebView;
import javafx.stage.FileChooser;
import javafx.stage.Stage;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;

public class BingWord extends Application {

private final GridPane grid = new GridPane();
private final TextField inputText = new TextField();
private final Button btn = new Button();
private final Button btn1 = new Button();
private final Button btn2 = new Button();
private final Button btnPause = new Button();
private final WebView browser = new WebView();
private final WebEngine webEngine = browser.getEngine();
private final TextArea textArea = new TextArea();
private final Label info = new Label();
private int recordPosition = 1;
private String currentWord = "";
private Connection connection = null;
private Statement statement;
private ResultSet rs;
private int maxNumber = 1;
private Boolean pause = false;

@Override
public void start(Stage primaryStage) throws SQLException {

//程序启动后就连接好SQLite数据库,或者新建一个dictionary.db数据库文件
connection = DriverManager.getConnection("jdbc:sqlite:dictionary.db");
statement = connection.createStatement();
statement.setQueryTimeout(30);
statement.executeUpdate("CREATE TABLE IF NOT EXISTS wordlist (id INTEGER PRIMARY KEY AUTOINCREMENT, word TEXT, pronunciation1 TEXT, pronunciation2 TEXT)");
System.out.println("dictionary.db数据库已经连接");

//文本框供手动设置开始行位置,默认值为第一行开始
inputText.setText("1");

//查找必应词典音标的启动按钮
Tooltip tooltip=new Tooltip();
tooltip.setText("请在右侧输入框内填入开始查找的行号");
tooltip.setFont(new Font("Arial", 20));

btn.setTooltip(tooltip);
btn.setText("开始查找音标行");
btn.setOnAction(new EventHandler<ActionEvent>() {
@Override
public void handle(ActionEvent event) {
//重启
pause = false;
//获取文本框中设定的起始行位置
recordPosition = Integer.parseInt(inputText.getText());
try {
//查询记录总条数
rs = statement.executeQuery("SELECT count(*) FROM wordlist;");
while (rs.next()) {
maxNumber = rs.getInt(1);
System.out.println("数据库中总单词量:" + maxNumber);
}
//单词音标搜索程序
search();
} catch (SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}

}
});

//将EXCEL导入到SQLITE数据库中
Tooltip tooltip1=new Tooltip();
tooltip1.setText("将EXCEL表中第一列单词导入到SQLite数据库中");
tooltip1.setFont(new Font("Arial", 20));

btn1.setTooltip(tooltip1);
btn1.setText("Excel -> SQLite");
btn1.setOnAction(new EventHandler<ActionEvent>() {
@Override
public void handle(ActionEvent event) {
System.out.println("将Excel中的单词导入dictionary.db数据库的wordlist表中");
FileChooser fileChooser = new FileChooser();
fileChooser.setTitle("打开EXCEL文件");
fileChooser.getExtensionFilters().add(new FileChooser.ExtensionFilter("EXCEL", "*.xls"));
File file = fileChooser.showOpenDialog(primaryStage);
if (file == null) {
return;
}
String filePath = file.getPath();
System.out.println(filePath);
try {
//导入EXCEL文件
FileInputStream inputExcel = new FileInputStream(filePath);
HSSFWorkbook wb = new HSSFWorkbook(inputExcel);
//获取SHEET的张数
int sheetCount = wb.getNumberOfSheets();
System.out.println("导入的EXCEL表总共有工作表张数:" + sheetCount);
//循环操作各个工作表
for (int i = 0; i < sheetCount; i++) {
HSSFSheet sheet = wb.getSheetAt(i);
int rowCount = sheet.getLastRowNum();
System.out.println("sheet" + i + "总共有" + rowCount + "行");
for (int j = 0; j <= rowCount; j++) {
HSSFRow row = sheet.getRow(j);
if (row != null) {
if (row.getCell(0) != null) {
row.getCell(0).setCellType(1);
//循环读取各个工作表第一列
String cellValue = row.getCell(0).getStringCellValue().trim();
//先查询看单词是否存在
rs = statement.executeQuery("SELECT * FROM wordlist WHERE word='" + cellValue + "';");
System.out.println("SELECT * FROM wordlist WHERE word='" + cellValue + "';");

if (rs.next()) {
//如果单词已经存在则无需重复插入
} else {
//如果单词不存在就将该单词插入到单词列表中
statement.executeUpdate("INSERT INTO wordlist (word) VALUES('" + cellValue + "');");
System.out.println("INSERT INTO wordlist (word) VALUES('" + cellValue + "');");
}
}
}
}
}
info.setText("EXCEL中的数据已经导入数据库完毕");
//记得打开的文件要及时关闭
wb.close();
inputExcel.close();
} catch (FileNotFoundException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException | SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}
}
});

//将数据写入EXCEL文件保存,注意EXCEL字体需要设置为IPAPANNEW,否则音标会乱码
Tooltip tooltip2=new Tooltip();
tooltip2.setText("完整的单词列表和单词音标将从数据库输出到EXCEL表中");
tooltip2.setFont(new Font("Arial", 20));

btn2.setTooltip(tooltip2);
btn2.setText("SQLite -> Excel");
btn2.setOnAction(new EventHandler<ActionEvent>() {
@Override
public void handle(ActionEvent event) {
System.out.println("将SQLITE数据库中的数据导出到EXCEL文件中");
FileChooser fileChooser = new FileChooser();
fileChooser.setTitle("保存到EXCEL文件");
fileChooser.getExtensionFilters().add(new FileChooser.ExtensionFilter("EXCEL", "*.xls"));
fileChooser.setInitialFileName("wordlist.xls");
File file = fileChooser.showSaveDialog(primaryStage);
if (file == null) {
return;
}
String filePath = file.getPath();
System.out.println(filePath);
try {
//获取总记录数目maxNumber
rs = statement.executeQuery("SELECT count(*) FROM wordlist;");
while (rs.next()) {
maxNumber = rs.getInt(1);
System.out.println("数据库中总单词量:" + maxNumber);
}
//计算总共需要多少个工作表来存数据
int maxRowNumber = 65536;
int sheetCount = (int) (Math.floor(maxNumber / (maxRowNumber + 1)) + 1);
//目标EXCEL文件输出流
FileOutputStream outputExcel = new FileOutputStream(filePath);
//创建EXCEL表
HSSFWorkbook wb = new HSSFWorkbook();
//循环创建各个工作表和单元格
for (int i = 0; i < sheetCount; i++) {
HSSFSheet sheet = wb.createSheet("wordlist" + i);
//设置列宽
sheet.setColumnWidth(1, 6400);
sheet.setColumnWidth(2, 6400);
sheet.setColumnWidth(3, 6400);
for (int j = 0; j < maxRowNumber; j++) {
HSSFRow row = sheet.createRow(j);
HSSFCell cell0 = row.createCell(0);
HSSFCell cell1 = row.createCell(1);
HSSFCell cell2 = row.createCell(2);
HSSFCell cell3 = row.createCell(3);
cell0.setCellType(1);
cell1.setCellType(1);
cell2.setCellType(1);
cell3.setCellType(1);
}
}
//将数据库记录存入Excel各个对应单元格
rs = statement.executeQuery("SELECT * FROM wordlist;");
while (rs.next()) {
int currentLineNumber = 0;
String currentWordSpell = "";
String currentPronunciation1 = "";
String currentPronunciation2 = "";

currentLineNumber = rs.getInt(1);

if (rs.getString(2) != null) {
currentWordSpell = rs.getString(2).trim();
}
if (rs.getString(3) != null) {
currentPronunciation1 = rs.getString(3).trim();
}
if (rs.getString(4) != null) {
currentPronunciation2 = rs.getString(4).trim();
}

//计算当前记录应该写入第几个工作表第几行
int targetSheetNumber = (int) (Math.floor(currentLineNumber / (maxRowNumber + 1)));
int targetRowNumber = (currentLineNumber - 1) % maxRowNumber;

//将数据库的记录写入相应单元格中
Row currentRow = wb.getSheetAt(targetSheetNumber).getRow(targetRowNumber);
currentRow.getCell(0).setCellValue(currentLineNumber + "");
currentRow.getCell(1).setCellValue(currentWordSpell);
currentRow.getCell(2).setCellValue(currentPronunciation1);
currentRow.getCell(3).setCellValue(currentPronunciation2);

}
//写入文件并关闭
wb.write(outputExcel);
wb.close();
outputExcel.close();
info.setText("数据已经保存到文件 " + filePath);

} catch (FileNotFoundException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
} catch (IOException | SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}
}
});

//暂停按钮设置
Tooltip tooltip3=new Tooltip();
tooltip3.setText("暂停查找循环");
tooltip3.setFont(new Font("Arial", 20));

btnPause.setTooltip(tooltip3);
btnPause.setText("Pause");
btnPause.setOnAction(new EventHandler<ActionEvent>() {
@Override
public void handle(ActionEvent event) {
pause = true;
}
});

//WEBVIEW浏览器监听加载成功事件
webEngine.getLoadWorker().stateProperty()
.addListener(new ChangeListener<Worker.State>() {
@Override
public void changed(ObservableValue<? extends Worker.State> ov,
Worker.State oldState, Worker.State newState) {

//如果点击了暂停按钮则跳出程序循环
if (pause == true) {
return;
}
//每次浏览器加载完毕则执行操作
if (newState == Worker.State.SUCCEEDED) {
//获取浏览器中网页的HTML
String docContent = webEngine.getDocument().getDocumentElement().getTextContent();
String resultString = "";

//用正则表达式搜索获取HTML中的美式音标
Pattern patternUS = Pattern.compile("美\\S\\[[^\\[\\]]+\\]");
Matcher matcherUS = patternUS.matcher(docContent);
if (matcherUS.find()) {
resultString = matcherUS.group(0).replaceAll("[美\\s\\[\\]]", "");
//把单个单引号替换成两个单引号避免SQL报错
resultString = resultString.replaceAll("'", "''");
System.out.println("PronunciationUS:" + resultString);
info.setText("PronunciationUS: " + resultString);
try {
//如果找到了音标则写入数据库
statement.executeUpdate("UPDATE wordlist SET pronunciation1='" + resultString + "' WHERE word='" + currentWord + "';");
System.out.println("UPDATE wordlist SET pronunciation1='" + resultString + "' WHERE word='" + currentWord + "';");
} catch (SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}
} else {
System.out.println("NO MATCH");
info.setText("NO MATCH");
}

//用正则表达式搜索获取HTML中的英式音标
Pattern patternUK = Pattern.compile("英\\S\\[[^\\[\\]]+\\]");
Matcher matcherUK = patternUK.matcher(docContent);
if (matcherUK.find()) {
resultString = matcherUK.group(0).replaceAll("[英\\s\\[\\]]", "");
//把单个单引号替换成两个单引号避免SQL报错
resultString = resultString.replaceAll("'", "''");
System.out.println("PronunciationUK: " + resultString);
info.setText("PronunciationUK: " + resultString);
try {
//如果找到了音标则写入数据库
statement.executeUpdate("UPDATE wordlist SET pronunciation2='" + resultString + "' WHERE word='" + currentWord + "';");
System.out.println("UPDATE wordlist SET pronunciation2='" + resultString + "' WHERE word='" + currentWord + "';");
} catch (SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}
} else {
System.out.println("NO MATCH");
info.setText("NO MATCH");
}

//进入查询接龙式循环
recordPosition++;
inputText.setText(recordPosition + "");

if (recordPosition > maxNumber) {
info.setText("大功告成");
return;
}

try {
//继续查询程序
search();
} catch (SQLException ex) {
Logger.getLogger(BingWord.class.getName()).log(Level.SEVERE, null, ex);
}

}
}

});

//整理控件布局并显示到场景中
grid.add(btn, 0, 0);
grid.add(inputText, 1, 0);
grid.add(btn1, 2, 0);
grid.add(btn2, 3, 0);
grid.add(btnPause, 4, 0);
grid.add(browser, 0, 1, 5, 1);
grid.add(info, 0, 3, 5, 1);

grid.setPadding(new Insets(10, 10, 10, 10));
grid.setVgap(10);
grid.setHgap(10);

browser.setPrefSize(800, 600);

Scene scene = new Scene(grid, 600, 400);

scene.widthProperty().addListener(
new ChangeListener() {
@Override
public void changed(ObservableValue observable,
Object oldValue, Object newValue) {
Double width = (Double) newValue;
browser.setPrefWidth(width - 20);
}
}
);
scene.heightProperty().addListener(
new ChangeListener() {
@Override
public void changed(ObservableValue observable,
Object oldValue, Object newValue) {
Double height = (Double) newValue;
browser.setPrefHeight(height - 50);
}
}
);

primaryStage.setTitle("必应英汉词典音标爬虫 Benny");
primaryStage.setScene(scene);
primaryStage.show();
}

public static void main(String[] args) {
launch(args);
}

public void search() throws SQLException {
//按单词列表序号依次开始在必应词典网站中请求查询各个单词
rs = statement.executeQuery("SELECT * FROM wordlist WHERE id=" + recordPosition);
if (rs.next()) {
currentWord = rs.getString(2);
System.out.println("当前正在处理的单词是:" + currentWord);
webEngine.load("http://cn.bing.com/dict/search?q=" + currentWord);
}
}

}

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息