We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
using DocumentFormat.OpenXml.Packaging;
using NPOI.HWPF;
using NPOI.HWPF.Extractor;
using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;
using WordInterop = Microsoft.Office.Interop.Word;

namespace MsWordTextExtractor
{
    /// <summary>
    /// Extracts plain text from Word documents, trying three strategies in order:
    /// the Open XML SDK, NPOI's HWPF extractor, and finally Word automation via COM interop.
    /// </summary>
    public static class DocxTextExtractor
    {
        // Namespace of the WordprocessingML elements (w:p, w:t, w:footnote, ...).
        const string WordNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

        /// <summary>
        /// Extracts the text of the document at <paramref name="filePath"/>.
        /// </summary>
        /// <param name="filePath">Path of the Word document to read.</param>
        /// <returns>
        /// The extracted text, or <see cref="string.Empty"/> when all three strategies fail
        /// (the message of the last failure is written to the console).
        /// </returns>
        public static string Extract(string filePath)
        {
            // The bare catches are deliberate: any failure of one strategy
            // simply hands the file to the next one.
            try
            {
                return ReadAllTextParts(filePath);
            }
            catch
            {
                try
                {
                    return NpoiDocExtractor(filePath);
                }
                catch
                {
                    try
                    {
                        return WordInteropExtractor(filePath);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex.Message);
                        return string.Empty;
                    }
                }
            }
        }

        /// <summary>
        /// Reads the main document body followed by the footnotes and endnotes parts
        /// (when present) using the Open XML SDK.
        /// </summary>
        /// <exception cref="InvalidDataException">The package has no main document part.</exception>
        static string ReadAllTextParts(string filePath)
        {
            StringBuilder stb = new StringBuilder();

            using (WordprocessingDocument wordprocessingDocument = WordprocessingDocument.Open(filePath, false))
            {
                MainDocumentPart mainPart = wordprocessingDocument.MainDocumentPart;
                if (mainPart == null)
                {
                    // Throw (rather than return "") so that Extract falls through to the next strategy.
                    throw new InvalidDataException("The document has no main document part.");
                }

                using (Stream bodyStream = mainPart.GetStream())
                {
                    stb.AppendLine(ReadTextPart(bodyStream));
                }

                AppendNotes(stb, mainPart.FootnotesPart);
                // Fixed: the endnotes branch used to re-read the footnotes part, which dropped
                // endnotes and failed when a document had endnotes but no footnotes.
                AppendNotes(stb, mainPart.EndnotesPart);
            }

            return stb.ToString();
        }

        /// <summary>
        /// Appends the text of a footnotes/endnotes part, preceded by a blank line,
        /// when the part exists and yields any text.
        /// </summary>
        static void AppendNotes(StringBuilder target, OpenXmlPart notesPart)
        {
            if (notesPart == null)
            {
                return;
            }

            string notes;
            using (Stream notesStream = notesPart.GetStream())
            {
                notes = ReadFootnotesPart(notesStream);
            }

            if (!string.IsNullOrEmpty(notes))
            {
                target.AppendLine();
                target.AppendLine(notes);
            }
        }

        /// <summary>
        /// Loads a part's XML and returns the document together with a namespace manager
        /// that maps the "w" prefix to the WordprocessingML namespace.
        /// </summary>
        static XmlDocument LoadWordXml(Stream partStream, out XmlNamespaceManager xmlNamespaceManager)
        {
            NameTable nameTable = new NameTable();
            xmlNamespaceManager = new XmlNamespaceManager(nameTable);
            xmlNamespaceManager.AddNamespace("w", WordNamespace);

            XmlDocument xmlDocument = new XmlDocument(nameTable);
            xmlDocument.Load(partStream);
            return xmlDocument;
        }

        /// <summary>
        /// Returns the text of every w:p element in the part, one paragraph per line,
        /// with leading and trailing whitespace trimmed from the overall result.
        /// </summary>
        static string ReadTextPart(Stream partStream)
        {
            XmlNamespaceManager xmlNamespaceManager;
            XmlDocument xmlDocument = LoadWordXml(partStream, out xmlNamespaceManager);

            StringBuilder stringBuilder = new StringBuilder();
            foreach (XmlNode paragraphNode in xmlDocument.SelectNodes("//w:p", xmlNamespaceManager))
            {
                ReadTextContent(stringBuilder, paragraphNode, xmlNamespaceManager);
                stringBuilder.Append(Environment.NewLine);
            }

            return stringBuilder.ToString().Trim();
        }

        /// <summary>
        /// Returns the text of every w:footnote / w:endnote element in the part, one note
        /// per line, each prefixed with its id. Notes with id "-1" or "0" are skipped.
        /// </summary>
        static string ReadFootnotesPart(Stream partStream)
        {
            XmlNamespaceManager xmlNamespaceManager;
            XmlDocument xmlDocument = LoadWordXml(partStream, out xmlNamespaceManager);

            StringBuilder stringBuilder = new StringBuilder();
            foreach (XmlNode footnoteNode in xmlDocument.SelectNodes("//w:footnote | .//w:endnote", xmlNamespaceManager))
            {
                string footnoteId = ReadWordId(footnoteNode);
                if (footnoteId == "-1" || footnoteId == "0")
                {
                    continue;
                }

                stringBuilder.Append(footnoteId);
                ReadTextContent(stringBuilder, footnoteNode, xmlNamespaceManager);
                stringBuilder.AppendLine();
            }

            return stringBuilder.ToString().Trim();
        }

        /// <summary>
        /// Returns the value of the node's w:id attribute, or an empty string when it is absent.
        /// The lookup is by namespace, so it does not depend on the prefix used in the file.
        /// </summary>
        static string ReadWordId(XmlNode node)
        {
            XmlAttribute idAttribute = node.Attributes?["id", WordNamespace];
            return idAttribute?.Value ?? string.Empty;
        }

        /// <summary>
        /// Appends the textual content found beneath <paramref name="xmlNode"/> in document order:
        /// w:t text, a tab for w:tab, a vertical tab for w:br, the id of a w:footnoteReference,
        /// and "*" for a w:numPr that has both w:ilvl and w:numId children.
        /// </summary>
        static void ReadTextContent(StringBuilder stringBuilder, XmlNode xmlNode, XmlNamespaceManager xmlNamespaceManager)
        {
            XmlNodeList textNodes = xmlNode.SelectNodes(".//w:t | .//w:tab | .//w:br | .//w:footnoteReference | .//w:numPr", xmlNamespaceManager);
            foreach (XmlNode textNode in textNodes)
            {
                // The XPath already restricts matches to the WordprocessingML namespace,
                // so dispatching on LocalName is safe and independent of the prefix.
                switch (textNode.LocalName)
                {
                    case "t":
                        stringBuilder.Append(textNode.InnerText);
                        break;
                    case "tab":
                        stringBuilder.Append("\t");
                        break;
                    case "br":
                        stringBuilder.Append("\v");
                        break;
                    case "footnoteReference":
                        stringBuilder.Append(ReadWordId(textNode));
                        break;
                    case "numPr":
                        XmlNode ilvlNode = textNode.SelectSingleNode(".//w:ilvl", xmlNamespaceManager);
                        XmlNode numIdNode = textNode.SelectSingleNode(".//w:numId", xmlNamespaceManager);
                        if (ilvlNode != null && numIdNode != null)
                        {
                            stringBuilder.Append("*");
                        }
                        break;
                }
            }
        }

        /// <summary>
        /// Extracts text with NPOI's HWPF <c>WordExtractor</c>.
        /// </summary>
        static string NpoiDocExtractor(string filePath)
        {
            using (FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                HWPFDocument doc = new HWPFDocument(fileStream);
                WordExtractor extractor = new WordExtractor(doc);
                return extractor.Text;
            }
        }

        /// <summary>
        /// Extracts text by having Word save the document as UTF-8 text to a temporary file
        /// and reading that file back. The temporary file is always deleted afterwards.
        /// </summary>
        /// <param name="filePath">Path of the document; resolved to a full path before use.</param>
        /// <returns>The contents of the text file produced by Word.</returns>
        public static string WordInteropExtractor(string filePath)
        {
            // Resolve once: Word resolves relative paths against its own working directory,
            // and the already-open check below compares against full names.
            string fullPath = Path.GetFullPath(filePath);

            // A unique name avoids overwriting (and then deleting) an unrelated "<name>.txt"
            // in the temp directory and keeps concurrent calls from colliding.
            string tempFilePath = Path.Combine(
                Path.GetTempPath(),
                Path.GetFileNameWithoutExtension(fullPath) + "_" + Guid.NewGuid().ToString("N") + ".txt");

            try
            {
                using (WordApp wordApp = new WordApp())
                {
                    WordInterop.Document doc = null;
                    bool isFileAlreadyOpen = false;

                    foreach (WordInterop.Document openDoc in wordApp.App.Documents)
                    {
                        if (openDoc.FullName.Equals(fullPath, StringComparison.OrdinalIgnoreCase))
                        {
                            doc = openDoc;
                            isFileAlreadyOpen = true;
                            break;
                        }
                    }

                    if (doc == null)
                    {
                        doc = wordApp.App.Documents.Open(fullPath, ReadOnly: true, Visible: false);
                    }

                    var originalFormat = doc.SaveFormat;
                    // Encoding 65001 is the UTF-8 code page.
                    doc.SaveAs2(tempFilePath, WordInterop.WdSaveFormat.wdFormatUnicodeText, Encoding: 65001, AddToRecentFiles: false);

                    if (isFileAlreadyOpen)
                    {
                        // NOTE(review): SaveAs2 re-targets the user's open document to the temp file,
                        // so it is saved back to its original path/format here. This rewrites the
                        // user's file on disk - confirm that is acceptable.
                        doc.SaveAs2(fullPath, originalFormat);
                    }
                    else
                    {
                        doc.Close(WordInterop.WdSaveOptions.wdDoNotSaveChanges);
                    }
                }

                return File.ReadAllText(tempFilePath);
            }
            finally
            {
                if (File.Exists(tempFilePath))
                {
                    File.Delete(tempFilePath);
                }
            }
        }
    }

    /// <summary>
    /// Wraps a Word <c>Application</c> COM object: attaches to a running instance when one
    /// is registered, otherwise starts a new one that is quit and released on dispose.
    /// </summary>
    class WordApp : IDisposable
    {
        public Microsoft.Office.Interop.Word.Application App;

        // True only when this wrapper started Word itself and therefore owns its lifetime.
        bool isNewApp;

        public WordApp()
        {
            try
            {
                App = (WordInterop.Application)Marshal.GetActiveObject("Word.Application");
            }
            catch (COMException)
            {
                App = new WordInterop.Application();
                isNewApp = true;
            }
        }

        /// <summary>
        /// Quits and releases Word when this wrapper started it; an instance that was
        /// already running is left untouched. Safe to call more than once.
        /// </summary>
        public void Dispose()
        {
            if (isNewApp && App != null)
            {
                try
                {
                    App.Quit();
                }
                finally
                {
                    // Release the COM reference even if Quit fails, and make Dispose idempotent.
                    Marshal.ReleaseComObject(App);
                    App = null;
                }
            }
        }
    }
}
The text was updated successfully, but these errors were encountered:
Why do you use HWPF for docx? Toxy has Word2007DocumentParser and Word2007TextParser
Sorry, something went wrong.
No branches or pull requests
The text was updated successfully, but these errors were encountered: