首页 > 布布扣 > 详细

Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索

时间:2014-10-16 13:21:16      阅读:3411      评论:0      收藏:0      [点我收藏+]

代码如下,代码没有优化,仅实现功能
该代码复制到程序中不能直接使用,需要先下载文章最后的例子,取得其中的dll后才可以使用

using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Text;
using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Cn;

using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.Text.RegularExpressions;
/// <summary>
/// Code-behind for a demo page that builds a Lucene.Net full-text index over a
/// directory of txt/htm/html/pdf/doc/rtf/ppt/xls files and runs keyword searches
/// against that index.
/// </summary>
/// <remarks>
/// Text is extracted with PDFBox (pdf) and Office interop (doc/rtf/ppt/xls).
/// Office interop starts a real Office process per file, so every extractor
/// closes the document and quits the application in a <c>finally</c> block.
/// </remarks>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Extensions (lower case, with leading dot) that get indexed.</summary>
    private static readonly string[] IndexableExtensions =
        { ".txt", ".htm", ".html", ".pdf", ".doc", ".rtf", ".ppt", ".xls" };

    /// <summary>Time at which the most recent indexing run started.</summary>
    public DateTime start = new DateTime();
    delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);
    IndexSearcher searcher = null;

    /// <summary>On first load, pre-fills the source directory box with the site's "doc" folder.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath("doc");
        }
    }

    #region Build index
    /// <summary>
    /// Rebuilds the index stored under the site's "index" folder from the
    /// directory entered in TextBox3 and reports the elapsed time in TextBox1.
    /// Any failure is reported in TextBox4.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // where the index is stored
        string sourcePath = TextBox3.Text;                 // directory to be indexed

        try
        {
            IndexWriter writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            try
            {
                start = DateTime.Now;
                IndexDirectory(writer, new FileInfo(sourcePath));
                writer.Optimize();
            }
            finally
            {
                // Always release the index write lock, even when indexing failed.
                writer.Close();
            }

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = "提示:索引完成,共用时 " + elapsed.TotalSeconds + " 秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
    }

    /// <summary>
    /// Recursively walks <paramref name="file"/>: directories are descended into,
    /// files with a supported extension are added to the index.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">A file or directory path wrapped in a <see cref="FileInfo"/>.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));  // recurse into children
            }
            return;
        }

        foreach (string extension in IndexableExtensions)
        {
            if (HasExtension(file, extension))
            {
                IndexFile(file, writer);
                return;
            }
        }
    }

    /// <summary>Case-insensitive, culture-independent extension comparison.</summary>
    private static bool HasExtension(FileInfo file, string extension)
    {
        return string.Equals(file.Extension, extension, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Adds one file to the index as a document with a stored, untokenized
    /// "filename" field and a tokenized, unstored "contents" field.
    /// A missing file is reported in TextBox4 and skipped; other exceptions propagate.
    /// </summary>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));

            string text;
            if (HasExtension(file, ".pdf"))
            {
                text = ExtractPdfText(file.FullName);
            }
            else if (HasExtension(file, ".doc") || HasExtension(file, ".rtf"))
            {
                // Word opens rtf files as well, so both share one extractor.
                text = ExtractWordText(file.FullName);
            }
            else if (HasExtension(file, ".ppt"))
            {
                text = ExtractPowerPointText(file.FullName);
            }
            else if (HasExtension(file, ".xls"))
            {
                text = ExtractExcelText(file.FullName);
            }
            else if (HasExtension(file, ".htm") || HasExtension(file, ".html"))
            {
                // Index the tag-stripped text, not the raw markup. The file is read
                // with the system default encoding, the same as plain text files.
                text = NoHTML(File.ReadAllText(file.FullName, Encoding.Default));
            }
            else
            {
                // Anything else is treated as plain text and streamed into the index.
                using (StreamReader reader = new StreamReader(file.FullName, Encoding.Default))
                {
                    doc.Add(new Field("contents", reader));
                    // The reader is consumed while the document is added, so it
                    // must stay open until AddDocument returns.
                    writer.AddDocument(doc);
                }
                return;
            }

            doc.Add(new Field("contents", text ?? "", Field.Store.NO, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
            return;
        }
    }

    /// <summary>Extracts the text of a PDF file with PDFBox and closes the document.</summary>
    private static string ExtractPdfText(string path)
    {
        PDDocument pdf = PDDocument.load(path);
        try
        {
            return new PDFTextStripper().getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    /// <summary>Extracts the full text of a Word-readable file (doc, rtf) through Word interop.</summary>
    private static string ExtractWordText(string path)
    {
        object nullobj = System.Reflection.Missing.Value;
        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        try
        {
            object filePath = path;
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(
                ref filePath, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            try
            {
                wordDoc.ActiveWindow.Selection.WholeStory();
                return wordDoc.ActiveWindow.Selection.Text;
            }
            finally
            {
                wordDoc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
        }
        finally
        {
            // Without this a failed extraction leaves a WINWORD process running.
            wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
        }
    }

    /// <summary>Extracts the text of every shape on every slide through PowerPoint interop.</summary>
    private static string ExtractPowerPointText(string path)
    {
        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        try
        {
            PowerPoint.Presentation presentation = pptApp.Presentations.Open(path,
                        Microsoft.Office.Core.MsoTriState.msoTrue,
                        Microsoft.Office.Core.MsoTriState.msoFalse,
                        Microsoft.Office.Core.MsoTriState.msoFalse);
            try
            {
                StringBuilder text = new StringBuilder();
                foreach (PowerPoint.Slide slide in presentation.Slides)
                {
                    foreach (PowerPoint.Shape shape in slide.Shapes)
                    {
                        try
                        {
                            text.Append(shape.TextFrame.TextRange.Text);
                            // Separator so words from adjacent shapes do not fuse into one token.
                            text.Append(' ');
                        }
                        catch (Exception)
                        {
                            // Best effort: shapes without a text frame (pictures,
                            // charts, ...) throw here and are deliberately skipped.
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                presentation.Close();
            }
        }
        finally
        {
            pptApp.Quit();
        }
    }

    /// <summary>Extracts the cell values of every worksheet's used range through Excel interop.</summary>
    private static string ExtractExcelText(string path)
    {
        object nullobj = System.Reflection.Missing.Value;
        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj);
            try
            {
                StringBuilder text = new StringBuilder();
                int sheetCount = xBook.Sheets.Count;
                for (int i = 1; i <= sheetCount; i++)  // Office collections are 1-based
                {
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        xBook.Sheets[i] as Microsoft.Office.Interop.Excel.Worksheet;
                    if (xSheet == null)
                    {
                        continue;  // chart sheets and the like have no cells
                    }

                    // Address cells relative to the used range: it does not
                    // necessarily start at A1.
                    Microsoft.Office.Interop.Excel.Range used = xSheet.UsedRange;
                    int rowCount = used.Rows.Count;
                    int columnCount = used.Columns.Count;
                    for (int row = 1; row <= rowCount; row++)
                    {
                        for (int column = 1; column <= columnCount; column++)
                        {
                            text.Append(((Microsoft.Office.Interop.Excel.Range)used.Cells[row, column]).Value2);
                            // Separator so adjacent cells do not fuse into one token.
                            text.Append(' ');
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                // Never save: a "save changes?" prompt would hang a server-side Excel.
                xBook.Close(false, nullobj, nullobj);
            }
        }
        finally
        {
            xApp.Quit();
        }
    }

    /// <summary>
    /// Strips script blocks, tags and comments from an HTML string, decodes a few
    /// common entities, removes remaining angle brackets and line breaks, and
    /// returns the HTML-encoded, trimmed result.
    /// </summary>
    /// <param name="Htmlstring">Raw HTML; null or empty yields an empty string.</param>
    /// <returns>The HTML-encoded plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove scripts. Singleline lets ".*?" span the line breaks inside a script block.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        // Remove HTML tags, whitespace that follows a line break, and comments.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
        // Decode common named/numeric entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
        // Drop any other numeric entity.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
        // string.Replace returns a new string; the result must be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");
        Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
        return Htmlstring;
    }
    #endregion

    #region Search
    /// <summary>
    /// Parses the keyword in TextBox2 against the "contents" field, searches the
    /// index under the site's "index" folder and prints the hits to TextBox1.
    /// Any failure is appended to TextBox4.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // where the index is stored
        string keyword = TextBox2.Text;
        try
        {
            searcher = new IndexSearcher(indexStorePath);
            try
            {
                QueryParser parser = new QueryParser("contents", new ChineseAnalyzer());
                Query query = parser.Parse(keyword);

                Hits hits = searcher.Search(query);
                printResult(hits);
            }
            finally
            {
                // Close the searcher even when parsing or searching failed.
                searcher.Close();
                searcher = null;
            }
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
    }

    /// <summary>
    /// Writes one line per hit (ordinal and stored file path) to TextBox1, or a
    /// "nothing found" message when there are no hits. Errors while loading an
    /// individual hit are appended to TextBox4 and that hit is skipped.
    /// </summary>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length();
        if (hitCount == 0)
        {
            output.Append("对不起,没有搜索到你要的结果。\n");
        }
        else
        {
            for (int i = 0; i < hitCount; i++)
            {
                try
                {
                    Document doc = h.Doc(i);
                    output.Append("这是第").Append(i + 1).Append("个搜索结果,文件路径为: ")
                          .Append(doc.Get("filename")).Append("\n");
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }
        output.Append("---------------------------\n");
        TextBox1.Text = output.ToString();
    }
    #endregion
}


完整demo下载,点击下载

Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!