package me.angrybyte.goose;

import android.util.Log;
import com.bria.common.controller.im.refactoring.BuddyKeyUtils;
import com.google.firebase.analytics.FirebaseAnalytics;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import me.angrybyte.goose.cleaners.DefaultDocumentCleaner;
import me.angrybyte.goose.cleaners.DocumentCleaner;
import me.angrybyte.goose.images.BestImageGuesser;
import me.angrybyte.goose.images.ImageExtractor;
import me.angrybyte.goose.network.GooseDownloader;
import me.angrybyte.goose.outputformatters.DefaultOutputFormatter;
import me.angrybyte.goose.outputformatters.Entities;
import me.angrybyte.goose.outputformatters.OutputFormatter;
import me.angrybyte.goose.texthelpers.ReplaceSequence;
import me.angrybyte.goose.texthelpers.StopWords;
import me.angrybyte.goose.texthelpers.StringReplacement;
import me.angrybyte.goose.texthelpers.StringSplitter;
import me.angrybyte.goose.texthelpers.WordStats;
import me.angrybyte.goose.texthelpers.string;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector;

/* loaded from: classes3.dex */
public class ContentExtractor {
    /** CSS selector for anchors treated as article tags: {@code rel=tag} links or links whose href contains {@code /tag/}. */
    private static final String A_REL_TAG_SELECTOR = "a[rel=tag], a[href*=/tag/]";
    /** Extraction settings supplied at construction time; never reassigned. */
    private final Configuration config;
    /** Lazily created by {@link #getDocCleaner()}. */
    private DocumentCleaner documentCleaner;
    /** Image extractor used when image fetching is enabled; assigned during extraction. */
    private ImageExtractor imageExtractor;
    /** MD5 hex digest of the URL of the most recent extraction; {@link #releaseResources()} matches cache file names against it. */
    private String linkHash;
    /** Output formatter used to render the top node as text; assigned during extraction. */
    private OutputFormatter outputFormatter;
    /** Removes the numeric entity for U+FFFD (the Unicode replacement character) from titles. */
    private static final StringReplacement MOTLEY_REPLACEMENT = StringReplacement.compile("&#65533;", "");
    /** Rewrites a {@code #!} fragment into its {@code ?_escaped_fragment_=} query form. */
    private static final StringReplacement ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement.compile("#!", "?_escaped_fragment_=");
    /** Replacement sequence built from {@code &raquo;} and {@code »}; applied to the title piece chosen by {@link #doTitleSplits}. */
    private static final ReplaceSequence TITLE_REPLACEMENTS = ReplaceSequence.create("&raquo;").append("»");
    // NOTE(review): the pipe pattern is taken from BuddyKeyUtils, an unrelated package; presumably a
    // decompiler-resolved constant whose value is an escaped "|" - confirm before relying on it.
    private static final StringSplitter PIPE_SPLITTER = new StringSplitter(BuddyKeyUtils.REGEX_SMALL_SEPARATOR);
    private static final StringSplitter DASH_SPLITTER = new StringSplitter(" - ");
    private static final StringSplitter ARROWS_SPLITTER = new StringSplitter("»");
    private static final StringSplitter COLON_SPLITTER = new StringSplitter(":");
    private static final StringSplitter SPACE_SPLITTER = new StringSplitter(" ");
    /**
     * Shared "no tags" result returned by {@link #extractTags}. It is a mutable set handed out to
     * callers, so callers must not add to it.
     */
    private static final Set<String> NO_STRINGS = new HashSet<>(0);

    /**
     * Creates an extractor bound to the given configuration.
     *
     * @param configuration settings read during extraction (date/additional-data extractors,
     *                      image fetching flag, cache directory)
     */
    public ContentExtractor(Configuration configuration) {
        this.config = configuration;
    }

    /**
     * Copies content from the siblings that precede the top node into the top node itself.
     *
     * <p>A preceding {@code p} sibling is copied whole (its outer HTML) in front of the top node's
     * first child. For any other preceding sibling, every descendant {@code p} whose stop-word
     * count exceeds 30% of {@link #getBaselineScoreForSiblings(Element)} is copied as a new
     * paragraph containing that paragraph's text, inserted at the start of the top node in
     * document order.
     *
     * @param element the top node; modified in place
     * @return the same {@code element} instance
     */
    private Element addSiblings(Element element) {
        int baselineScoreForSiblings = getBaselineScoreForSiblings(element);
        // The threshold is identical for every candidate paragraph, so compute it once.
        float minimumStopWords = (float) (baselineScoreForSiblings * 0.3d);
        for (Element sibling = element.previousElementSibling();
                sibling != null;
                sibling = sibling.previousElementSibling()) {
            if (sibling.tagName().equals("p")) {
                insertHtmlBeforeChild(element, 0, sibling.outerHtml());
                continue;
            }
            int insertIndex = 0;
            for (Element paragraph : sibling.getElementsByTag("p")) {
                if (minimumStopWords < StopWords.getStopWordCount(paragraph.text()).getStopWordCount()) {
                    // Close the paragraph with </p>; a second opening <p> makes the parser emit an
                    // extra, empty paragraph next to every injected one.
                    // NOTE(review): text() is inserted as markup without escaping - confirm that
                    // paragraphs containing '<' or '&' are handled acceptably.
                    insertHtmlBeforeChild(element, insertIndex, "<p>" + paragraph.text() + "</p>");
                    insertIndex++;
                }
            }
        }
        return element;
    }

    /**
     * Inserts {@code html} in front of the element child at {@code index}; when no such child
     * exists (for example the parent has no element children) the markup is added at the start
     * for index 0 and at the end otherwise, instead of throwing.
     */
    private static void insertHtmlBeforeChild(Element parent, int index, String html) {
        if (index < parent.children().size()) {
            parent.child(index).before(html);
        } else if (index == 0) {
            parent.prepend(html);
        } else {
            parent.append(html);
        }
    }

    /**
     * Renders a byte array as lower-case hexadecimal, two characters per byte, high nibble first.
     *
     * @param bArr bytes to encode
     * @return hex string of length {@code 2 * bArr.length}
     */
    private static String bytesToLowerCaseHex(byte[] bArr) {
        final char[] hexDigits = "0123456789abcdef".toCharArray();
        StringBuilder hex = new StringBuilder();
        for (byte value : bArr) {
            // Mask after shifting so sign extension of negative bytes is discarded.
            hex.append(hexDigits[(value >>> 4) & 0x0f]);
            hex.append(hexDigits[value & 0x0f]);
        }
        return hex.toString();
    }

    /**
     * Picks the element most likely to contain the article body.
     *
     * <p>Candidate nodes ({@link #getNodesToCheck(Document)}) with more than two stop words and
     * low link density each contribute a score to their parent (full score) and grandparent
     * (half score); the scores accumulate in the {@code gravityScore} attribute and the number of
     * contributions in {@code gravityNodes}. The ancestor with the highest accumulated score wins.
     *
     * <p>Scores are adjusted per candidate: a candidate accepted by {@link #isOkToBoost(Element)}
     * gets a bonus of {@code 50 / k} for the k-th boosted candidate; when there are more than 15
     * candidates, those in the last quarter get a squared penalty instead, replaced by a fixed
     * {@code +5} once the penalty magnitude exceeds 40.
     *
     * @param document cleaned document to analyse; ancestors are annotated in place
     * @return the best-scoring ancestor, the first scored ancestor when no score is positive,
     *         or {@code null} when no candidate qualified
     */
    private Element calculateBestNodeBasedOnClustering(Document document) {
        List<Element> candidates = new ArrayList<>();
        for (Element node : getNodesToCheck(document)) {
            WordStats stats = StopWords.getStopWordCount(node.text());
            if (stats.getStopWordCount() > 2 && !isHighLinkDensity(node)) {
                candidates.add(node);
            }
        }

        int candidateCount = candidates.size();
        double bottomNodesForNegativeScore = candidateCount * 0.25d;
        double startingBoost = 1.0d;
        // Insertion-ordered so that ties between equally scored ancestors resolve the same way
        // on every run instead of depending on hash iteration order.
        Set<Element> scoredAncestors = new LinkedHashSet<>();

        for (int index = 0; index < candidateCount; index++) {
            Element candidate = candidates.get(index);
            float boost = 0.0f;
            if (isOkToBoost(candidate)) {
                boost = (float) ((1.0d / startingBoost) * 50.0d);
                startingBoost += 1.0d;
            }
            int distanceFromEnd = candidateCount - index;
            if (candidateCount > 15 && distanceFromEnd <= bottomNodesForNegativeScore) {
                boost = -((float) Math.pow(((float) bottomNodesForNegativeScore) - distanceFromEnd, 2.0d));
                if (Math.abs(boost) > 40.0f) {
                    boost = 5.0f;
                }
            }
            int score = (int) (StopWords.getStopWordCount(candidate.text()).getStopWordCount() + boost);

            Element parent = candidate.parent();
            if (parent == null) {
                // Nothing to attribute the score to; skip rather than fail on a detached node.
                continue;
            }
            updateScore(parent, score);
            updateNodeCount(parent, 1);
            scoredAncestors.add(parent);

            Element grandparent = parent.parent();
            if (grandparent != null) {
                updateScore(grandparent, score / 2);
                updateNodeCount(grandparent, 1);
                scoredAncestors.add(grandparent);
            }
        }

        Element topNode = null;
        int topScore = 0;
        for (Element ancestor : scoredAncestors) {
            int score = getScore(ancestor);
            if (score > topScore) {
                topNode = ancestor;
                topScore = score;
            }
            // Guarantees a result even when every accumulated score is zero or negative.
            if (topNode == null) {
                topNode = ancestor;
            }
        }
        return topNode;
    }

    /**
     * Prepares the top node for output: first merges in preceding siblings via
     * {@link #addSiblings(Element)}, then prunes its non-paragraph children.
     *
     * <p>A non-{@code p} child is removed when it has high link density, when it is not a
     * {@code td} and contains no paragraphs after paragraphs shorter than 25 characters have been
     * dropped, or when it is not a {@code td} and its score is below 8% of the top node's score.
     *
     * @param element the top node; modified in place
     * @return the node returned by {@code addSiblings}
     */
    private Element cleanupNode(Element element) {
        Element topNode = addSiblings(element);
        for (Element child : topNode.children()) {
            if (child.tagName().equals("p")) {
                continue;
            }
            if (isHighLinkDensity(child)) {
                child.remove();
                continue;
            }
            for (Element paragraph : child.getElementsByTag("p")) {
                if (paragraph.text().length() < 25) {
                    paragraph.remove();
                }
            }
            // Re-query: the loop above may have removed every paragraph.
            boolean hasParagraphs = child.getElementsByTag("p").size() != 0;
            boolean isTableCell = child.tagName().equals("td");
            if (!hasParagraphs && !isTableCell) {
                child.remove();
                continue;
            }
            boolean scoresTooLow = getScore(child) < ((float) (getScore(topNode) * 0.08d));
            if (scoresTooLow && !isTableCell) {
                child.remove();
            }
        }
        return topNode;
    }

    /**
     * Splits a title with the given splitter and returns its longest piece (the first one on a
     * tie), after applying {@code TITLE_REPLACEMENTS} and trimming.
     *
     * <p>When the splitter produces no pieces at all, the unsplit title is used instead of failing
     * with an {@code ArrayIndexOutOfBoundsException}.
     * NOTE(review): this presumably happens for titles made only of separators, if
     * {@code StringSplitter} drops trailing empty strings like {@code Pattern.split} - confirm.
     *
     * @param str            the raw title text
     * @param stringSplitter separator-specific splitter
     * @return the cleaned longest piece
     */
    private String doTitleSplits(String str, StringSplitter stringSplitter) {
        String[] pieces = stringSplitter.split(str);
        if (pieces == null || pieces.length == 0) {
            return TITLE_REPLACEMENTS.replaceAll(str).trim();
        }
        String longest = pieces[0];
        for (String piece : pieces) {
            // Strictly greater keeps the earliest piece when lengths tie.
            if (piece.length() > longest.length()) {
                longest = piece;
            }
        }
        return TITLE_REPLACEMENTS.replaceAll(longest).trim();
    }

    /**
     * Escapes a string using the {@code Entities.HTML40} entity table.
     *
     * @param str text to escape, may be {@code null}
     * @return the escaped text, or {@code null} when {@code str} is {@code null}
     */
    public static String escapeHtml(String str) {
        return str == null ? null : Entities.HTML40.escape(str);
    }

    /**
     * Collects the distinct, non-empty link texts of all anchors under {@code element} that match
     * {@code A_REL_TAG_SELECTOR}.
     *
     * @param element root of the subtree to search
     * @return the tag texts, or the shared {@code NO_STRINGS} set when the element has no
     *         children or no anchor matches
     */
    private Set<String> extractTags(Element element) {
        if (element.children().isEmpty()) {
            return NO_STRINGS;
        }
        Elements tagLinks = Selector.select(A_REL_TAG_SELECTOR, element);
        if (tagLinks.isEmpty()) {
            return NO_STRINGS;
        }
        Set<String> tags = new HashSet<>(tagLinks.size());
        for (Element tagLink : tagLinks) {
            String label = tagLink.text();
            if (!string.isNullOrEmpty(label)) {
                tags.add(label);
            }
        }
        return tags;
    }

    /**
     * Finds embedded YouTube/Vimeo players near the top node.
     *
     * <p>Searches the subtree of the node's parent for {@code embed} elements, then
     * {@code object} elements, and keeps those whose {@code src} attribute contains
     * {@code "youtube"} or {@code "vimeo"}.
     *
     * @param element the top node
     * @return matching elements ({@code embed} matches first); empty when the node is
     *         {@code null}, has no parent, or nothing matches
     */
    private ArrayList<Element> extractVideos(Element element) {
        ArrayList<Element> videos = new ArrayList<>();
        // Explicit check instead of catching the NullPointerException a missing parent would cause.
        Element parent = element == null ? null : element.parent();
        if (parent == null) {
            return videos;
        }
        for (String tagName : new String[] {"embed", "object"}) {
            for (Element candidate : parent.getElementsByTag(tagName)) {
                // attr() yields "" for a missing attribute, so no null handling is needed.
                String src = candidate.attr("src");
                if (src.contains("youtube") || src.contains("vimeo")) {
                    videos.add(candidate);
                }
            }
        }
        return videos;
    }

    /**
     * Computes the average stop-word count of the "good" paragraphs under the top node, i.e.
     * {@code p} elements with more than two stop words and low link density.
     *
     * @param element the top node
     * @return the integer average, or {@code 100000} when no paragraph qualifies
     */
    private int getBaselineScoreForSiblings(Element element) {
        int qualifyingParagraphs = 0;
        int stopWordTotal = 0;
        for (Element paragraph : element.getElementsByTag("p")) {
            WordStats stats = StopWords.getStopWordCount(paragraph.text());
            if (stats.getStopWordCount() <= 2 || isHighLinkDensity(paragraph)) {
                continue;
            }
            qualifyingParagraphs++;
            stopWordTotal += stats.getStopWordCount();
        }
        if (qualifyingParagraphs <= 0) {
            return 100000;
        }
        return stopWordTotal / qualifyingParagraphs;
    }

    /**
     * Returns the page's canonical URL.
     *
     * <p>Reads the {@code href} of the first {@code link[rel=canonical]} element. When the
     * document has no such element, or its {@code href} is missing or blank, the crawled URL is
     * returned instead, so callers never receive an empty link for a page whose URL is known.
     *
     * @param document parsed document
     * @param str      the URL that was crawled, used as the fallback
     * @return the trimmed canonical href, or {@code str}
     */
    private String getCanonicalLink(Document document, String str) {
        Elements canonicalLinks = document.select("link[rel=canonical]");
        if (canonicalLinks.isEmpty()) {
            return str;
        }
        String href = canonicalLinks.first().attr("href");
        if (string.isNullOrEmpty(href)) {
            return str;
        }
        String trimmed = href.trim();
        return trimmed.isEmpty() ? str : trimmed;
    }

    /**
     * Returns the document cleaner, creating a {@code DefaultDocumentCleaner} on first use and
     * keeping it for later calls.
     */
    private DocumentCleaner getDocCleaner() {
        DocumentCleaner cleaner = this.documentCleaner;
        if (cleaner == null) {
            cleaner = new DefaultDocumentCleaner();
            this.documentCleaner = cleaner;
        }
        return cleaner;
    }

    /**
     * Extracts the host part of a URL.
     *
     * @param str URL text
     * @return the host reported by {@link URL#getHost()}
     * @throws RuntimeException wrapping the {@link MalformedURLException} when {@code str} is not
     *                          a valid URL
     */
    private String getDomain(String str) {
        URL parsed;
        try {
            parsed = new URL(str);
        } catch (MalformedURLException malformed) {
            throw new RuntimeException(malformed);
        }
        return parsed.getHost();
    }

    /**
     * Returns the image extractor held in the {@code imageExtractor} field, or a new
     * {@code BestImageGuesser} built from the configuration and the given URL when the field is
     * unset. The new instance is not stored by this method.
     *
     * @param str URL passed to the {@code BestImageGuesser} constructor
     */
    private ImageExtractor getImageExtractor(String str) {
        if (this.imageExtractor != null) {
            return this.imageExtractor;
        }
        return new BestImageGuesser(this.config, str);
    }

    /**
     * Reads an attribute of the first element matching a selector; the attribute name comes from
     * {@code FirebaseAnalytics.Param.CONTENT}.
     * NOTE(review): that constant is presumably the decompiler's stand-in for the literal
     * "content" - confirm.
     *
     * @param document parsed document
     * @param str      CSS selector for the meta element
     * @return the trimmed attribute value, or {@code ""} when nothing matches or the value is empty
     */
    private String getMetaContent(Document document, String str) {
        Elements matches = document.select(str);
        if (matches.isEmpty()) {
            return "";
        }
        String content = matches.first().attr(FirebaseAnalytics.Param.CONTENT);
        if (string.isNullOrEmpty(content)) {
            return "";
        }
        return content.trim();
    }

    /** Returns the value of the page's {@code meta[name=description]} element via {@link #getMetaContent}. */
    private String getMetaDescription(Document document) {
        final String descriptionSelector = "meta[name=description]";
        return getMetaContent(document, descriptionSelector);
    }

    /** Returns the value of the page's {@code meta[name=keywords]} element via {@link #getMetaContent}. */
    private String getMetaKeywords(Document document) {
        final String keywordsSelector = "meta[name=keywords]";
        return getMetaContent(document, keywordsSelector);
    }

    /**
     * Gathers the elements considered as text candidates: all {@code p} elements, then all
     * {@code pre} elements, then all {@code td} elements of the document.
     *
     * @param document parsed document
     * @return a new list holding the candidates in that order
     */
    private ArrayList<Element> getNodesToCheck(Document document) {
        ArrayList<Element> nodes = new ArrayList<>();
        for (String tagName : new String[] {"p", "pre", "td"}) {
            nodes.addAll(document.getElementsByTag(tagName));
        }
        return nodes;
    }

    /**
     * Returns the formatter held in the {@code outputFormatter} field, or a new
     * {@code DefaultOutputFormatter} when the field is unset. The new instance is not stored by
     * this method.
     */
    private OutputFormatter getOutputFormatter() {
        if (this.outputFormatter != null) {
            return this.outputFormatter;
        }
        return new DefaultOutputFormatter();
    }

    /**
     * Reads the score stored in an element's {@code gravityScore} attribute.
     *
     * @param element element to inspect, may be {@code null}
     * @return the parsed score, or {@code 0} when the element is {@code null} or the attribute is
     *         missing, empty or not an integer
     */
    private int getScore(Element element) {
        if (element == null) {
            return 0;
        }
        String rawScore = element.attr("gravityScore");
        if (string.isNullOrEmpty(rawScore)) {
            return 0;
        }
        try {
            return Integer.parseInt(rawScore);
        } catch (NumberFormatException notANumber) {
            return 0;
        }
    }

    /**
     * Derives the article title from the document's first {@code title} element.
     *
     * <p>The title text is split on the first separator it contains, tried in the order
     * {@code |}, dash, {@code »}, {@code :}, keeping the longest piece (see
     * {@link #doTitleSplits}). The result is HTML-escaped and passed through
     * {@code MOTLEY_REPLACEMENT}.
     *
     * @param document parsed document, may be {@code null}
     * @return the cleaned title, or {@code ""} when the document is {@code null} or has no
     *         non-empty title
     */
    private String getTitle(Document document) {
        // Explicit null check instead of catching NullPointerException around the lookup.
        if (document == null) {
            return "";
        }
        Elements titleElements = document.getElementsByTag("title");
        if (titleElements == null || titleElements.isEmpty()) {
            return "";
        }
        String text = titleElements.first().text();
        if (string.isNullOrEmpty(text)) {
            return "";
        }
        // Only the first matching separator is applied.
        if (text.contains("|")) {
            text = doTitleSplits(text, PIPE_SPLITTER);
        } else if (text.contains("-")) {
            // NOTE(review): the check is for a bare "-" while DASH_SPLITTER splits on " - ", so a
            // hyphenated word alone stops the "»" and ":" rules from running. Left as is because
            // changing it alters extracted titles - confirm the intended behavior.
            text = doTitleSplits(text, DASH_SPLITTER);
        } else if (text.contains("»")) {
            text = doTitleSplits(text, ARROWS_SPLITTER);
        } else if (text.contains(":")) {
            text = doTitleSplits(text, COLON_SPLITTER);
        }
        return MOTLEY_REPLACEMENT.replaceAll(escapeHtml(text));
    }

    /**
     * Returns the URL that should actually be fetched: a URL containing {@code #!} is passed
     * through {@code ESCAPED_FRAGMENT_REPLACEMENT}; any other URL is returned unchanged.
     */
    private String getUrlToCrawl(String str) {
        if (!str.contains("#!")) {
            return str;
        }
        return ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(str);
    }

    /**
     * Decides whether an element consists mostly of links.
     *
     * <p>The ratio of space-separated words inside {@code a} descendants to words in the whole
     * element is multiplied by the number of {@code a} descendants; a product above {@code 1.0}
     * counts as high density.
     *
     * @param element element to test
     * @return {@code false} when the element contains no links, otherwise the result of the
     *         comparison above
     */
    private static boolean isHighLinkDensity(Element element) {
        Elements links = element.getElementsByTag("a");
        if (links.size() == 0) {
            return false;
        }
        float totalWords = SPACE_SPLITTER.split(element.text().trim()).length;
        StringBuilder linkText = new StringBuilder();
        for (Element link : links) {
            linkText.append(link.text());
        }
        float linkWords = (float) SPACE_SPLITTER.split(linkText.toString()).length;
        float linkShare = linkWords / totalWords;
        float weighted = linkShare * ((float) links.size());
        return weighted > 1.0f;
    }

    /**
     * Checks whether a candidate is followed closely by a substantial paragraph.
     *
     * <p>Walks the following siblings, counting every sibling stepped over. The first {@code p}
     * found three or more steps away ends the search with {@code false}; a nearer {@code p} with
     * more than five stop words ends it with {@code true}.
     *
     * @param element candidate node
     * @return {@code true} when such a paragraph is found, {@code false} otherwise
     */
    private boolean isOkToBoost(Element element) {
        int stepsAway = 0;
        Element sibling = element.nextElementSibling();
        while (sibling != null) {
            if (sibling.tagName().equals("p")) {
                if (stepsAway >= 3) {
                    return false;
                }
                WordStats stats = StopWords.getStopWordCount(sibling.text());
                if (stats.getStopWordCount() > 5) {
                    return true;
                }
            }
            stepsAway++;
            sibling = sibling.nextElementSibling();
        }
        return false;
    }

    /**
     * Computes the MD5 digest of a string and returns it as lower-case hex.
     *
     * <p>The string is encoded with the platform default charset.
     * NOTE(review): the result is compared against cache file name prefixes in
     * {@code releaseResources()}; if those names are produced elsewhere, both sides must keep
     * using the same encoding.
     *
     * @param str text to hash
     * @return 32-character hex digest
     * @throws RuntimeException wrapping {@link NoSuchAlgorithmException} when MD5 is unavailable
     */
    private static String md5(String str) {
        MessageDigest md5Digest;
        try {
            md5Digest = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException unavailable) {
            throw new RuntimeException(unavailable);
        }
        byte[] hash = md5Digest.digest(str.getBytes());
        return bytesToLowerCaseHex(hash);
    }

    /**
     * Runs the full extraction pipeline for one page.
     *
     * <p>Steps: validate the URL, record its MD5 in {@code linkHash}, download the HTML when none
     * was supplied, parse it, fill in publish date, additional data, tags, title, meta fields,
     * canonical link and domain, pick the top node and, when one is found, extract videos, the
     * top image (if image fetching is enabled) and the cleaned text.
     *
     * <p>Extraction is best effort: an exception after the download leaves the article partially
     * populated and it is returned as is. Cached files for this URL are released on both the
     * success and the failure path.
     *
     * @param str  URL of the page
     * @param str2 already downloaded HTML, or {@code null} to download it
     * @return the article (possibly partially populated), or {@code null} when the download fails
     * @throws IllegalArgumentException when the URL is malformed
     */
    private Article performExtraction(String str, String str2) {
        String urlToCrawl = getUrlToCrawl(str);
        try {
            // Constructed only to validate the URL; the instance itself is not needed.
            new URL(urlToCrawl);
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Invalid URL Passed in: " + urlToCrawl, e);
        }
        this.linkHash = md5(urlToCrawl);
        ParseWrapper parseWrapper = new ParseWrapper();

        String html = str2;
        if (html == null) {
            try {
                html = GooseDownloader.getHtml(urlToCrawl, true);
            } catch (Exception ignored) {
                // Deliberate: callers are told about a failed download through the null result.
                return null;
            }
        }

        Article article = new Article();
        try {
            article.setRawHtml(html);
            Document parse = parseWrapper.parse(html, urlToCrawl);
            article.setPublishDate(this.config.getPublishDateExtractor().extract((Element) parse));
            article.setAdditionalData(this.config.getAdditionalDataExtractor().extract((Element) parse));
            article.setTags(extractTags(parse));
            Document clean = getDocCleaner().clean(parse);
            article.setTitle(getTitle(clean));
            article.setMetaDescription(getMetaDescription(clean));
            article.setMetaKeywords(getMetaKeywords(clean));
            article.setCanonicalLink(getCanonicalLink(clean, urlToCrawl));
            article.setDomain(article.getCanonicalLink());
            article.setTopNode(calculateBestNodeBasedOnClustering(clean));
            if (article.getTopNode() != null) {
                article.setMovies(extractVideos(article.getTopNode()));
                if (this.config.isEnableImageFetching()) {
                    // Do not store the result in the field: the default extractor is constructed
                    // with this call's URL, and a stored instance would be handed back by
                    // getImageExtractor() for every later URL processed by this object.
                    article.setTopImage(getImageExtractor(urlToCrawl).getBestImage(clean, article.getTopNode()));
                }
                cleanupNode(article.getTopNode());
                this.outputFormatter = getOutputFormatter();
                article.setCleanedArticleText(this.outputFormatter.getFormattedText(article.getTopNode()));
            }
        } catch (Exception ignored) {
            // Deliberate best effort: whatever was extracted before the failure is still returned.
        } finally {
            try {
                releaseResources();
            } catch (RuntimeException ignored) {
                // Cleanup problems must not replace the extraction result.
            }
        }
        return article;
    }

    /**
     * Adds {@code i} to the counter stored in the element's {@code gravityNodes} attribute.
     * A missing, empty or non-numeric attribute counts as {@code 0}. A {@code null} element is
     * ignored, matching how {@link #getScore(Element)} treats {@code null}.
     *
     * @param element element whose counter is updated, may be {@code null}
     * @param i       amount to add
     */
    private void updateNodeCount(Element element, int i) {
        if (element == null) {
            return;
        }
        int current;
        try {
            String attr = element.attr("gravityNodes");
            current = string.isNullOrEmpty(attr) ? 0 : Integer.parseInt(attr);
        } catch (NumberFormatException e) {
            // An unparseable value restarts the counter from zero.
            current = 0;
        }
        element.attr("gravityNodes", Integer.toString(current + i));
    }

    /**
     * Adds {@code i} to the score stored in the element's {@code gravityScore} attribute.
     * A missing, empty or non-numeric attribute counts as {@code 0}. A {@code null} element is
     * ignored, matching how {@link #getScore(Element)} treats {@code null}.
     *
     * @param element element whose score is updated, may be {@code null}
     * @param i       amount to add (may be negative)
     */
    private void updateScore(Element element, int i) {
        if (element == null) {
            return;
        }
        int current;
        try {
            String attr = element.attr("gravityScore");
            current = string.isNullOrEmpty(attr) ? 0 : Integer.parseInt(attr);
        } catch (NumberFormatException e) {
            // An unparseable value restarts the score from zero.
            current = 0;
        }
        element.attr("gravityScore", Integer.toString(current + i));
    }

    /**
     * Downloads the page at the given URL and extracts its article content.
     *
     * @param str URL of the page
     * @return the extracted article, or {@code null} when the download fails
     * @throws IllegalArgumentException when the URL is malformed
     */
    public Article extractContent(String str) {
        // No pre-fetched HTML: the page is downloaded during extraction.
        final String prefetchedHtml = null;
        return performExtraction(str, prefetchedHtml);
    }

    /**
     * Extracts article content for the given URL from HTML the caller already holds.
     *
     * @param str  URL of the page
     * @param str2 the page's HTML; when {@code null} the page is downloaded instead
     * @return the extracted article, or {@code null} when a required download fails
     * @throws IllegalArgumentException when the URL is malformed
     */
    public Article extractContent(String str, String str2) {
        Article extracted = performExtraction(str, str2);
        return extracted;
    }

    /**
     * Deletes the files in the configured cache directory whose names start with the hash of the
     * most recently processed URL. A file that cannot be deleted is reported through
     * {@code Log.e}.
     *
     * <p>Does nothing when no extraction has run yet (no hash recorded) or when the cache
     * directory cannot be listed.
     */
    public void releaseResources() {
        String hash = this.linkHash;
        if (hash == null) {
            // Nothing has been extracted yet, so there is nothing of ours to delete.
            return;
        }
        File cacheDirectory = new File(this.config.getCacheDirectory());
        String[] fileNames = cacheDirectory.list();
        if (fileNames == null) {
            return;
        }
        for (String fileName : fileNames) {
            if (fileName.startsWith(hash) && !new File(cacheDirectory, fileName).delete()) {
                Log.e(ContentExtractor.class.getName(), "Unable to remove temp file: " + fileName);
            }
        }
    }
}
