// Libraries used (jar files on the classpath):
// bcmail-jdk14-132.jar, bcprov-jdk14-132.jar, checkstyle-all-4.2.jar, FontBox-0.1.0-dev.jar,
// lucene-core-2.0.0.jar, PDFBox-0.7.3.jar, poi-3.0-alpha3-20061212.jar,
// poi-contrib-3.0-alpha3-20061212.jar, poi-scratchpad-3.0-alpha3-20061212.jar
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class Test {
/**
* @param args
*/
public
static void p(Object obj) {
System.out.println(obj);
}
public static void main(String[] args)
{
try {
p(readPpt("src/1.dps"));
} catch (Exception e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
//读取ppt
public static String
readPpt(String path) throws Exception {
StringBuffer content = new
StringBuffer("");
try {
SlideShow ss = new
SlideShow(new HSLFSlideShow(path));//
path为文件的全路径名称,建立SlideShow
Slide[] slides =
ss.getSlides();// 获得每一张幻灯片
for (int i = 0; i <
slides.length; i++) {
TextRun[] t =
slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
for (int j = 0; j < t.length; j++) {
content.append(t[j].getText());//
这里会将文字内容加到content中去
}
content.append(slides[i].getTitle());
}
}
catch (Exception ex) {
System.out.println(ex.toString());
}
return
content.toString().trim();
}
// 读取xls
public static String
readXls(String path) throws Exception {
StringBuffer content =
new StringBuffer("");// 文档内容
HSSFWorkbook workbook = new
HSSFWorkbook(new FileInputStream(path));
int sheetCount =
workbook.getNumberOfSheets();// excel几张表
for (int i = 0; i <
sheetCount; i++) {// 遍历excel表
HSSFSheet sheet =
workbook.getSheetAt(i);// 对excel的第一个表引用
int rowCount =
sheet.getLastRowNum();// 取得最后一行的下标
for (int j = 0; j <
rowCount; j++) {// 循环每一行
HSSFRow row =
sheet.getRow(j);// 引用行
if (row == null)
{
continue;
} else
{
short cellNum =
row.getLastCellNum();
for (short m = 0; m <
cellNum; m++) {
HSSFCell cell =
row.getCell(m);// 引用行中的一个单元格
if (cell !=
null) {
int cellType =
cell.getCellType();
//
CELL_TYPE_NUMERIC 0 数字
//
CELL_TYPE_STRING 1 字符串
//
CELL_TYPE_FORMULA 2 公式
//
CELL_TYPE_BLANK 3 空格
//
CELL_TYPE_BOOLEAN 4 布尔值
//
CELL_TYPE_ERROR 5 错误
switch
(cellType) {
//
单元格类型为数字
case
HSSFCell.CELL_TYPE_NUMERIC:
// 取数字单元格的值
double d =
cell.getNumericCellValue();
content.append(String.valueOf(d) + "
");
break;
//
单元格类型为字符串
case
HSSFCell.CELL_TYPE_STRING:
String str =
cell.getStringCellValue().trim();
if (!str.equals("")) {
content.append(str + "
");
}
break;
//
单元格类型为公式
case
HSSFCell.CELL_TYPE_FORMULA:
// 不读取公式
// String formula =
cell.getCellFormula();
//
content = content + formula+"
";
break;
//
单元格类型为空白
case
HSSFCell.CELL_TYPE_BLANK:
break;
//
单元格类型为布尔值
case
HSSFCell.CELL_TYPE_BOOLEAN:
// boolean bool =
cell.getBooleanCellValue();
// content = content + bool+"
";
break;
//
单元格类型为错误
case
HSSFCell.CELL_TYPE_ERROR:
//
byte errorCode =
cell.getErrorCellValue();
//
content = content + errorCode+"
";
break;
default:
break;
}
} else
{
// content = content + "..." +"
";//没有数据的单元格使用...填充
}
}
}
content.append("\r");
}
}
return content.toString().trim();
}
// 读取pdf
public static String
readPdf(String path) throws Exception {
StringBuffer content =
new StringBuffer("");// 文档内容
FileInputStream fis = new
FileInputStream(path);
PDFParser p = new
PDFParser(fis);
p.parse();
PDFTextStripper ts =
new PDFTextStripper();
content.append(ts.getText(p.getPDDocument()));
fis.close();
return content.toString().trim();
}
// 读取word,只能读取文本内容 图片不行
public static
String readWord(String path) throws Exception {
StringBuffer content = new
StringBuffer("");// 文档内容
HWPFDocument doc = new HWPFDocument(new
FileInputStream(path));
Range range =
doc.getRange();
int paragraphCount = range.numParagraphs();//
段落
for (int i = 0; i < paragraphCount; i++) {//
遍历段落读取数据
Paragraph pp =
range.getParagraph(i);
content.append(pp.text());
}
return
content.toString().trim();
}
// 读取text
public static String
readTxt(String path) {
StringBuffer content = new
StringBuffer("");// 文档内容
try {
FileReader
reader = new FileReader(path);
BufferedReader br = new
BufferedReader(reader);
String s1 = null;
while ((s1 =
br.readLine()) != null) {
content.append(s1 +
"\r");
}
br.close();
reader.close();
} catch
(IOException e) {
e.printStackTrace();
}
return content.toString().trim();
}
}