package com.mathworks.mlwidgets.help.search.lucene;

import com.mathworks.mlwidgets.html.HTMLUtils;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLEntities;
import org.xml.sax.InputSource;

/* loaded from: input_file:com/mathworks/mlwidgets/help/search/lucene/NekoHtmlSaxDocumentHandler.class */
public class NekoHtmlSaxDocumentHandler implements DocumentHandler {
    // Matches '&', then any run of non-whitespace characters (greedy), then ';'.
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("\\&\\S*;");
    // Set once by the constructor; the parser reads it to decide whether a
    // heading may start a new DocumentInfo.
    private boolean fAllowMultipleSections;
    // Element names; the parser compares them against raw tag names ignoring case.
    private static final String HEADER = "head";
    private static final String BODY = "body";
    private static final String TITLE = "title";
    private static final String ANCHOR = "a";
    private static final String TABLE = "table";
    private static final String IMAGE = "img";
    private static final String FRAMESET = "frameset";
    private static final String PARAGRAPH = "p";
    private static final String FONT = "font";
    // Attribute names looked up on start tags.
    private static final String IMAGE_SOURCE = "src";
    private static final String NAME = "name";
    private static final String ID = "id";
    // A text chunk equal to this (ignoring case) makes the parser stop collecting body text.
    private static final String SEE_ALSO = "See Also";
    private static final String SCRIPT = "script";
    // Character code 160; characters() replaces this code with a plain space.
    private static final char NBSP = 160;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/mathworks/mlwidgets/help/search/lucene/NekoHtmlSaxDocumentHandler$NekoHtmlSaxParser.class */
    public class NekoHtmlSaxParser extends AbstractSAXParser {
        // True between the start and end tags of the head element.
        private boolean fInHeader;
        // True between the start and end tags of the body element.
        private boolean fInBody;
        // True while inside a title element that was opened within the head.
        private boolean fInTitle;
        // Set when a text chunk equals "See Also" (ignoring case); cleared when the
        // next heading element starts. While set, includeText() returns false.
        private boolean fInSeeAlso;
        // True between the start and end tags of a script element in the body.
        private boolean fInScript;
        // Incremented for every table start tag in the body, decremented for every table end tag.
        private int fTableDepth;
        // Set when an img whose src is "b_prev.gif" is seen; cleared when the table
        // depth returns to zero. While set, appendContent() discards text.
        private boolean fInPrevNext;
        // Set for a p element with class "copy"; cleared at the p end tag.
        // While set, appendContent() discards text.
        private boolean fInCopyright;
        // Set when a comment's text is a heading tag name; cleared when a real heading
        // element starts. While set, a font end tag terminates the current heading.
        private boolean fIsSectionComment;
        // Set when a heading ends and reset by startNewDocument(); decides whether
        // heading text is appended to the title or recorded as a section heading.
        private boolean fTitleComplete;
        // Heading currently being read, or null when not inside a heading.
        private SectionHeadingTag fHeadingTag;
        // Accumulated text of the title element.
        private String fTitleTag;
        // Buffer for text collected while inside a table; null when no buffer is active.
        private StringBuilder fTableContent;
        // Buffer for the body text of the current DocumentInfo; null until text is appended.
        private StringBuilder fBodyContent;
        // "name" of the most recent anchor or "id" of the most recent heading;
        // cleared after every non-empty text chunk.
        private String fCurrentAnchor;
        // Set when a heading has class "reftitle"; cleared at the next p end tag.
        private boolean fRefTitle;
        // Copied from fRefTitle at each p start tag; cleared at the p end tag. While set,
        // body text is also appended to the current DocumentInfo's ref page summary.
        private boolean fInRefPageSummary;
        // Set by a meta tag named "chunktype" whose content is "refpage" or "classrefpage".
        private boolean fReference;
        // Set by a meta tag named "sw-edition" whose content equals
        // LuceneDocSearchConstants.STUDENT_PAGE_FIELD.
        private boolean fStudent;
        // DocumentInfo objects saved so far, in the order they were saved.
        private List<DocumentInfo> fDocInfos;
        // DocumentInfo currently being filled in.
        private DocumentInfo fCurrentDocInfo;
        // Set for frameset pages and for titles containing ": TABLE OF CONTENTS" or
        // ": INDEX"; saveDocument() then saves nothing.
        private boolean fDoNotIndex;

        /**
         * Creates a parser backed by a NekoHTML {@link HTMLConfiguration}, resets all
         * parse state and opens the first (anchor-less) {@link DocumentInfo}.
         */
        private NekoHtmlSaxParser() {
            super(new HTMLConfiguration());
            // Position flags.
            this.fInHeader = false;
            this.fInBody = false;
            this.fInTitle = false;
            this.fInSeeAlso = false;
            this.fInScript = false;
            this.fTableDepth = 0;
            this.fInPrevNext = false;
            this.fInCopyright = false;
            // Heading / title tracking.
            this.fIsSectionComment = false;
            this.fTitleComplete = false;
            this.fHeadingTag = null;
            // Text buffers are created lazily.
            this.fTableContent = null;
            this.fBodyContent = null;
            // Page-level flags.
            this.fRefTitle = false;
            this.fReference = false;
            this.fStudent = false;
            // Parameterized instead of a raw ArrayList so the assignment is type-checked.
            this.fDocInfos = new ArrayList<DocumentInfo>();
            this.fDoNotIndex = false;
            // fCurrentDocInfo is still null here, so this only creates the first DocumentInfo.
            startNewDocument(false);
        }

        /**
         * Receives a chunk of character data and routes it to the title, the
         * section headings, the page title tag or the body text.
         *
         * <p>Character code 160 is replaced by a space and every whitespace run is
         * collapsed to one space before the text is used. An empty chunk is ignored.
         * After any non-empty chunk the current anchor is cleared.
         *
         * @param xMLString     the character data
         * @param augmentations unused
         * @throws XNIException declared by the overridden method
         */
        public void characters(XMLString xMLString, Augmentations augmentations) throws XNIException {
            String text = xMLString.toString().replace((char) 160, ' ').replaceAll("\\s+", " ");
            if (text.length() == 0) {
                return;
            }
            if (text.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.SEE_ALSO)) {
                this.fInSeeAlso = true;
            }
            if (this.fHeadingTag != null) {
                // A heading that carries an anchor starts a new DocumentInfo, but only once
                // the current one already has a title.
                if (allowMultipleSections() && this.fHeadingTag.startsNewDocument() && this.fCurrentDocInfo.getTitle() != null && this.fCurrentAnchor != null) {
                    startNewDocument(true);
                }
                String headingText = NekoHtmlSaxDocumentHandler.this.fixEntities(text);
                if (this.fTitleComplete) {
                    this.fCurrentDocInfo.addSectionHeading(headingText);
                } else {
                    this.fCurrentDocInfo.appendToTitle(headingText);
                }
            }
            if (this.fInTitle) {
                this.fTitleTag = (this.fTitleTag == null) ? text : this.fTitleTag + text;
                // Test the whole title collected so far, not just this chunk: the parser may
                // deliver one title in several callbacks. Locale.ROOT keeps the upper-casing
                // independent of the default locale (e.g. Turkish dotted/dotless i).
                String upperTitle = this.fTitleTag.replaceAll("\\s+", " ").toUpperCase(Locale.ROOT);
                if (upperTitle.contains(": TABLE OF CONTENTS") || upperTitle.contains(": INDEX")) {
                    this.fDoNotIndex = true;
                }
            } else if (includeText()) {
                appendContent(text);
                if (this.fInRefPageSummary) {
                    this.fCurrentDocInfo.appendToRefPageSummary(text);
                }
            }
            this.fCurrentAnchor = null;
        }

        /**
         * Appends text to the active buffer: the table buffer while inside a table
         * that has one, otherwise the body buffer (created on first use). Text is
         * dropped while in the prev/next navigation or the copyright paragraph.
         *
         * @param str text to append
         */
        private void appendContent(String str) {
            boolean suppressed = this.fInPrevNext || this.fInCopyright;
            if (suppressed) {
                return;
            }
            boolean useTableBuffer = this.fTableDepth > 0 && this.fTableContent != null;
            StringBuilder target = useTableBuffer ? this.fTableContent : this.fBodyContent;
            if (target == null) {
                // Only the body buffer can be missing at this point.
                this.fBodyContent = new StringBuilder(str);
            } else {
                target.append(str);
            }
        }

        /**
         * @return true when text should be collected: inside the body, and neither
         *         after a "See Also" marker nor inside a script element
         */
        private boolean includeText() {
            return this.fInBody && !this.fInSeeAlso && !this.fInScript;
        }

        /**
         * Handles a start tag: records page-level meta flags, tracks entry into
         * head/body/title, marks frameset pages as not indexable and delegates any
         * other tag seen inside the body to {@link #startBodyElement}. The superclass
         * handler is always invoked afterwards.
         *
         * @param qName         element name; only {@code rawname} is inspected
         * @param xMLAttributes attributes of the start tag
         * @param augmentations passed through to the superclass
         */
        public void startElement(QName qName, XMLAttributes xMLAttributes, Augmentations augmentations) {
            final String tag = qName.rawname;
            if (tag.equalsIgnoreCase("meta")) {
                final String metaName = xMLAttributes.getValue("name");
                final String metaContent = xMLAttributes.getValue("content");
                if (metaName != null && metaContent != null) {
                    boolean refPageContent = metaContent.equalsIgnoreCase("refpage") || metaContent.equalsIgnoreCase("classrefpage");
                    if (metaName.equalsIgnoreCase("chunktype") && refPageContent) {
                        this.fReference = true;
                    } else if (metaName.equalsIgnoreCase("sw-edition") && metaContent.equalsIgnoreCase(LuceneDocSearchConstants.STUDENT_PAGE_FIELD)) {
                        this.fStudent = true;
                    }
                }
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.HEADER)) {
                this.fInHeader = true;
            } else if (tag.equalsIgnoreCase("body")) {
                this.fInBody = true;
            } else if (this.fInHeader && tag.equalsIgnoreCase("title")) {
                this.fInTitle = true;
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.FRAMESET)) {
                this.fDoNotIndex = true;
            } else if (this.fInBody) {
                startBodyElement(qName, xMLAttributes);
            }
            super.startElement(qName, xMLAttributes, augmentations);
        }

        /**
         * Handles a start tag encountered inside the body. A separating space is
         * appended to the content first; then anchors, scripts, headings, tables,
         * images and paragraphs update the corresponding parse state. Other tags
         * have no further effect.
         *
         * @param qName         element name; only {@code rawname} is inspected
         * @param xMLAttributes attributes of the start tag
         */
        private void startBodyElement(QName qName, XMLAttributes xMLAttributes) {
            appendContent(" ");
            final String tag = qName.rawname;

            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.ANCHOR)) {
                this.fCurrentAnchor = xMLAttributes.getValue("name");
                return;
            }
            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.SCRIPT)) {
                this.fInScript = true;
                return;
            }

            final String lowerTag = tag.toLowerCase();
            if (SectionHeadingTag.isSectionHeadingTag(lowerTag)) {
                this.fHeadingTag = SectionHeadingTag.valueOf(lowerTag);
                this.fIsSectionComment = false;
                this.fInSeeAlso = false;
                // A heading's id plays the role of an anchor name.
                this.fCurrentAnchor = xMLAttributes.getValue(NekoHtmlSaxDocumentHandler.ID);
                String cssClass = xMLAttributes.getValue("class");
                if (cssClass != null && cssClass.equalsIgnoreCase("reftitle")) {
                    this.fRefTitle = true;
                }
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.TABLE)) {
                // Only the outermost table opens a fresh buffer.
                if (this.fTableDepth == 0) {
                    this.fTableContent = new StringBuilder();
                }
                this.fTableDepth++;
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.IMAGE)) {
                String source = xMLAttributes.getValue(NekoHtmlSaxDocumentHandler.IMAGE_SOURCE);
                if (source != null && source.equalsIgnoreCase("b_prev.gif")) {
                    // Entering the prev/next navigation: drop whatever the table buffered.
                    this.fInPrevNext = true;
                    this.fTableContent = null;
                }
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.PARAGRAPH)) {
                String cssClass = xMLAttributes.getValue("class");
                if (cssClass != null && cssClass.equalsIgnoreCase("copy")) {
                    this.fInCopyright = true;
                }
                this.fInRefPageSummary = this.fRefTitle;
            }
        }

        /**
         * @return true when the handler was created with multiple sections enabled
         *         and the page is not a reference page
         */
        private boolean allowMultipleSections() {
            if (isReferencePage()) {
                return false;
            }
            return NekoHtmlSaxDocumentHandler.this.fAllowMultipleSections;
        }

        /**
         * Saves the current DocumentInfo (if any) and begins a new one with an
         * empty body buffer and an incomplete title.
         *
         * @param z when true, the new DocumentInfo receives the current anchor
         */
        private void startNewDocument(boolean z) {
            // Must run first: it reads the body buffer and the current DocumentInfo.
            saveDocument();

            DocumentInfo fresh = new DocumentInfo();
            if (z) {
                fresh.setAnchor(this.fCurrentAnchor);
            }
            this.fCurrentDocInfo = fresh;
            this.fBodyContent = null;
            this.fTitleComplete = false;
        }

        /**
         * Stores the trimmed body text (or null when there is none) on the current
         * DocumentInfo and adds it to the result list. Does nothing when there is
         * no current DocumentInfo or the page is flagged as not indexable.
         */
        private void saveDocument() {
            if (this.fCurrentDocInfo != null && !this.fDoNotIndex) {
                String bodyText = getBody();
                this.fCurrentDocInfo.setBody(bodyText == null ? null : bodyText.trim());
                this.fDocInfos.add(this.fCurrentDocInfo);
            }
        }

        /**
         * Handles an end tag: appends a separating space while in the body, then
         * leaves the head/body/title/script state, flushes table text, or ends the
         * current paragraph or heading.
         *
         * <p>Table text is moved to the body only when the outermost table closes.
         * Flushing at every nested table end would append the table buffer to
         * itself (the parser is still inside a table at that point) and duplicate
         * the text. An end tag with no open table is ignored so the depth cannot
         * become negative.
         *
         * @param qName         element name; only {@code rawname} is inspected
         * @param augmentations unused
         * @throws XNIException declared by the overridden method
         */
        public void endElement(QName qName, Augmentations augmentations) throws XNIException {
            if (this.fInBody) {
                appendContent(" ");
            }
            String tag = qName.rawname;
            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.HEADER)) {
                this.fInHeader = false;
                return;
            }
            if (tag.equalsIgnoreCase("body")) {
                this.fInBody = false;
                return;
            }
            if (this.fInHeader && tag.equalsIgnoreCase("title")) {
                this.fInTitle = false;
                return;
            }
            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.SCRIPT)) {
                this.fInScript = false;
                return;
            }
            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.TABLE)) {
                if (this.fTableDepth <= 0) {
                    // Unmatched end tag: nothing to close.
                    return;
                }
                this.fTableDepth--;
                if (this.fTableDepth == 0) {
                    // Depth is zero now, so appendContent() writes to the body buffer.
                    if (!this.fInPrevNext && this.fTableContent != null) {
                        appendContent(this.fTableContent.toString());
                    }
                    this.fInPrevNext = false;
                    this.fTableContent = null;
                }
                return;
            }
            if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.PARAGRAPH)) {
                this.fInCopyright = false;
                this.fRefTitle = false;
                this.fInRefPageSummary = false;
            } else if (SectionHeadingTag.isSectionHeadingTag(tag.toLowerCase())) {
                this.fHeadingTag = null;
                this.fTitleComplete = true;
            } else if (tag.equalsIgnoreCase(NekoHtmlSaxDocumentHandler.FONT) && this.fIsSectionComment) {
                // A heading announced by a comment ends at the closing font tag.
                this.fHeadingTag = null;
                this.fTitleComplete = true;
            }
        }

        /**
         * Treats a comment whose trimmed, lower-cased text is a heading tag name
         * as the start of that heading, then forwards the comment to the superclass.
         *
         * @param xMLString     the comment text
         * @param augmentations passed through to the superclass
         */
        public void comment(XMLString xMLString, Augmentations augmentations) {
            final String marker = xMLString.toString().trim().toLowerCase();
            final boolean announcesHeading = SectionHeadingTag.isSectionHeadingTag(marker);
            if (announcesHeading) {
                this.fHeadingTag = SectionHeadingTag.valueOf(marker);
                this.fIsSectionComment = true;
            }
            super.comment(xMLString, augmentations);
        }

        /**
         * Finishes parsing: saves the last DocumentInfo and stamps every saved
         * DocumentInfo with the section, the reference/student flags and, where
         * needed, the page title taken from the title tag.
         *
         * <p>The section is the second part of the title tag unless it is missing
         * or equal to the first part, in which case the parenthesised part is used,
         * falling back to the first part. Entities in the section are decoded
         * exactly once. The reference/student flags are applied even when the page
         * has no title tag.
         *
         * @param augmentations passed through to the superclass
         * @throws XNIException if the superclass handler fails
         */
        public void endDocument(Augmentations augmentations) throws XNIException {
            super.endDocument(augmentations);
            saveDocument();

            String[] titleParts = parseTitleTag();
            String pageTitle = titleParts[0];
            String section = titleParts[1];
            if (section == null || section.equals(pageTitle)) {
                section = titleParts[2];
            }
            if (section == null) {
                section = pageTitle;
            }

            String decodedSection = (section == null) ? null : NekoHtmlSaxDocumentHandler.this.fixEntities(section.trim());
            String decodedTitle = (pageTitle == null) ? null : NekoHtmlSaxDocumentHandler.this.fixEntities(pageTitle);
            boolean referencePage = isReferencePage();
            boolean studentPage = isStudentPage();

            for (DocumentInfo documentInfo : this.fDocInfos) {
                if (decodedSection != null) {
                    documentInfo.setSection(decodedSection);
                }
                documentInfo.setReferencePage(referencePage);
                documentInfo.setStudent(studentPage);
                // Reference pages always take the title from the title tag; other pages
                // only when no heading supplied one.
                if ((documentInfo.getTitle() == null || referencePage) && decodedTitle != null) {
                    documentInfo.setTitle(decodedTitle);
                }
            }
        }

        /**
         * Splits the collected title tag text into up to three parts.
         *
         * <p>Index 2 receives the text between the last "(" and the last ")" when
         * the ")" follows the "("; that text and everything after the "(" are then
         * removed. The remainder is split at the first "::" into index 0 (before)
         * and index 1 (after); without "::" the whole remainder goes to index 0.
         * All parts are trimmed so they can be compared with each other.
         *
         * @return a three-element array; elements that could not be determined are null
         */
        private String[] parseTitleTag() {
            String[] parts = new String[3];
            if (this.fTitleTag == null) {
                return parts;
            }
            String remainder = this.fTitleTag.trim();
            int open = remainder.lastIndexOf("(");
            int close = remainder.lastIndexOf(")");
            if (open >= 0 && close > open) {
                parts[2] = remainder.substring(open + 1, close).trim();
                remainder = remainder.substring(0, open);
            }
            int separator = remainder.indexOf("::");
            if (separator > -1) {
                parts[1] = remainder.substring(separator + 2).trim();
                parts[0] = remainder.substring(0, separator).trim();
            } else {
                parts[0] = remainder.trim();
            }
            return parts;
        }

        /* JADX INFO: Access modifiers changed from: private */
        /**
         * @return the list of DocumentInfo objects saved so far (the parser's own list, not a copy)
         */
        public List<DocumentInfo> getDocumentInfo() {
            return fDocInfos;
        }

        /**
         * @return the collected body text with each whitespace run collapsed to a
         *         single space, or null when no body text was collected
         */
        private String getBody() {
            return (this.fBodyContent == null) ? null : this.fBodyContent.toString().replaceAll("\\s+", " ");
        }

        /** @return true when a meta tag marked this page as a reference page */
        private boolean isReferencePage() {
            return fReference;
        }

        /** @return true when a meta tag marked this page as a student-edition page */
        private boolean isStudentPage() {
            return fStudent;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/mathworks/mlwidgets/help/search/lucene/NekoHtmlSaxDocumentHandler$SectionHeadingTag.class */
    /**
     * Heading elements that delimit sections. The constant names are the
     * lower-case tag names, so {@code valueOf} works on a lower-cased tag.
     */
    public enum SectionHeadingTag {
        h1,
        h2,
        h3,
        h4;

        /**
         * @return true for every heading except {@code h4}
         */
        public boolean startsNewDocument() {
            return this != h4;
        }

        /**
         * Tests whether a string is the name of one of the heading constants,
         * ignoring case.
         *
         * @param str candidate tag name; may be null
         * @return true when {@code str} names a heading; false otherwise, including for null
         */
        public static boolean isSectionHeadingTag(String str) {
            if (str == null) {
                return false;
            }
            for (SectionHeadingTag tag : values()) {
                if (tag.name().equalsIgnoreCase(str)) {
                    return true;
                }
            }
            return false;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * @param z whether a heading may start a separate DocumentInfo within one page
     *          (never applied to reference pages)
     */
    public NekoHtmlSaxDocumentHandler(boolean z) {
        fAllowMultipleSections = z;
    }

    /**
     * Parses an HTML document with a fresh parser and returns the DocumentInfo
     * objects extracted from it. The stream is not closed by this method.
     *
     * @param inputStream the HTML document
     * @return the extracted DocumentInfo objects
     * @throws DocumentHandlerException if parsing fails for any reason; the
     *         original exception is attached as the cause
     */
    @Override // com.mathworks.mlwidgets.help.search.lucene.DocumentHandler
    public List<DocumentInfo> getDocumentInfo(InputStream inputStream) throws DocumentHandlerException {
        try {
            NekoHtmlSaxParser parser = new NekoHtmlSaxParser();
            parser.parse(new InputSource(inputStream));
            return parser.getDocumentInfo();
        } catch (Exception e) {
            // The cause travels with the wrapped exception; printing the stack trace here
            // as well would report the same failure twice.
            throw new DocumentHandlerException("An error occurred while parsing an HTML document", e);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Replaces character references in a string with the characters they denote.
     *
     * <p>Handles decimal ({@code &#65;}) and hexadecimal ({@code &#x41;})
     * numeric references, including code points above U+FFFF, and named
     * references known to {@link HTMLEntities}. Anything that cannot be resolved
     * is copied unchanged.
     *
     * <p>{@code ENTITIES_PATTERN} is greedy and may span several adjacent
     * references, so each match is narrowed to the first ';' after its start and
     * the last '&amp;' before that ';'. Scanning then resumes right after the ';'.
     *
     * @param str text to decode; must not be null
     * @return the decoded text
     */
    public String fixEntities(String str) {
        Matcher matcher = ENTITIES_PATTERN.matcher(str);
        StringBuilder decoded = new StringBuilder(str.length());
        int copiedUpTo = 0;
        while (matcher.find(copiedUpTo)) {
            // Both lookups succeed inside the match: it starts with '&' and ends with ';'.
            int semicolon = str.indexOf(';', matcher.start());
            int ampersand = str.lastIndexOf('&', semicolon);
            decoded.append(str, copiedUpTo, ampersand);

            String name = str.substring(ampersand + 1, semicolon);
            int codePoint = name.startsWith("#") ? parseNumericReference(name.substring(1)) : HTMLEntities.get(name);
            if (Character.isValidCodePoint(codePoint)) {
                decoded.appendCodePoint(codePoint);
            } else {
                // Unknown or malformed reference: keep the original text.
                decoded.append(str, ampersand, semicolon + 1);
            }
            copiedUpTo = semicolon + 1;
        }
        decoded.append(str, copiedUpTo, str.length());
        return decoded.toString();
    }

    /**
     * Parses the digits of a numeric character reference (the part after '#').
     *
     * @param digits decimal digits, or 'x'/'X' followed by hexadecimal digits
     * @return the parsed value, or -1 when the digits are malformed
     */
    private static int parseNumericReference(String digits) {
        try {
            if (digits.startsWith("x") || digits.startsWith("X")) {
                return Integer.parseInt(digits.substring(1), 16);
            }
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Command-line driver: parses each argument as a document and prints the
     * extracted fields. An argument starting with "jar:file:" is loaded through
     * {@link HTMLUtils#getSource}; any other argument is opened as a file.
     *
     * @param strArr document locations
     * @throws Exception if a document cannot be read or parsed
     */
    public static void main(String[] strArr) throws Exception {
        NekoHtmlSaxDocumentHandler handler = new NekoHtmlSaxDocumentHandler(true);
        for (String location : strArr) {
            System.out.println("*** Indexing document: " + location + " ***");
            InputStream input = location.startsWith("jar:file:") ? new ByteArrayInputStream(HTMLUtils.getSource(location).getBytes()) : new FileInputStream(new File(location));
            List<DocumentInfo> documentInfos;
            try {
                documentInfos = handler.getDocumentInfo(input);
            } finally {
                // getDocumentInfo() does not close the stream; release the file handle here.
                input.close();
            }
            for (DocumentInfo documentInfo : documentInfos) {
                System.out.println("Title: " + documentInfo.getTitle());
                System.out.println("Section: " + documentInfo.getSection());
                System.out.println("Anchor: " + documentInfo.getAnchor());
                System.out.println("Ref page? " + documentInfo.isReferencePage());
                System.out.println("Content: " + documentInfo.getBody());
                System.out.println("Section headings: " + documentInfo.getSectionHeadings());
                System.out.println("------------------------------------------------");
            }
        }
    }
}
