public class OOXMLWordAndPowerPointTextHandler
extends org.xml.sax.helpers.DefaultHandler
This class does not generally check for namespaces, and it can be applied to PPTX and DOCX for text extraction.
This can be used to scrape content from charts. It currently ignores formula (<c:f/>) elements.
This does not work with .xlsx or .vsdx.
TODO: move this into POI?
| Modifier and Type | Class and Description |
|---|---|
| static class | OOXMLWordAndPowerPointTextHandler.EditType |
| static interface | OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler |
| Modifier and Type | Field and Description |
|---|---|
| static java.lang.String | W_NS |
| Constructor and Description |
|---|
| OOXMLWordAndPowerPointTextHandler(OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler bodyContentsHandler, java.util.Map<java.lang.String,java.lang.String> hyperlinks) |
| OOXMLWordAndPowerPointTextHandler(OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler bodyContentsHandler, java.util.Map<java.lang.String,java.lang.String> hyperlinks, boolean includeTextBox, boolean concatenatePhoneticRuns) |
| Modifier and Type | Method and Description |
|---|---|
| void | characters(char[] ch, int start, int length) |
| void | endDocument() |
| void | endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) |
| void | endPrefixMapping(java.lang.String prefix) |
| void | ignorableWhitespace(char[] ch, int start, int length) |
| void | startDocument() |
| void | startElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts) |
| void | startPrefixMapping(java.lang.String prefix, java.lang.String uri) |
public static final java.lang.String W_NS
public OOXMLWordAndPowerPointTextHandler(OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler bodyContentsHandler, java.util.Map<java.lang.String,java.lang.String> hyperlinks)
public OOXMLWordAndPowerPointTextHandler(OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler bodyContentsHandler, java.util.Map<java.lang.String,java.lang.String> hyperlinks, boolean includeTextBox, boolean concatenatePhoneticRuns)
public void startDocument()
throws org.xml.sax.SAXException
Specified by: startDocument in interface org.xml.sax.ContentHandler
Overrides: startDocument in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void endDocument()
throws org.xml.sax.SAXException
Specified by: endDocument in interface org.xml.sax.ContentHandler
Overrides: endDocument in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void startPrefixMapping(java.lang.String prefix,
java.lang.String uri)
throws org.xml.sax.SAXException
Specified by: startPrefixMapping in interface org.xml.sax.ContentHandler
Overrides: startPrefixMapping in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void endPrefixMapping(java.lang.String prefix)
throws org.xml.sax.SAXException
Specified by: endPrefixMapping in interface org.xml.sax.ContentHandler
Overrides: endPrefixMapping in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void startElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.Attributes atts)
throws org.xml.sax.SAXException
Specified by: startElement in interface org.xml.sax.ContentHandler
Overrides: startElement in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void endElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName)
throws org.xml.sax.SAXException
Specified by: endElement in interface org.xml.sax.ContentHandler
Overrides: endElement in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void characters(char[] ch,
int start,
int length)
throws org.xml.sax.SAXException
Specified by: characters in interface org.xml.sax.ContentHandler
Overrides: characters in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException
public void ignorableWhitespace(char[] ch,
int start,
int length)
throws org.xml.sax.SAXException
Specified by: ignorableWhitespace in interface org.xml.sax.ContentHandler
Overrides: ignorableWhitespace in class org.xml.sax.helpers.DefaultHandler
Throws: org.xml.sax.SAXException

Copyright © 2010 - 2023 Adobe. All Rights Reserved