// SinglePageCrawler.java

/*
 * html-crawler is a library to help crawling websites.
 * Copyright (C) 2020  Uwe Plonus
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

package org.sw4j.tool.html.crawler;

import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SinglePageCrawler {

    // Listener registries, one list per kind of reference-carrying HTML construct. The lists are assigned
    // once by the private constructor. NOTE(review): within this class only linkListener is read (by
    // handleLink); the other lists are stored but not consulted by any code in this class.

    /** Listeners meant for {@code <a href ping>} references. */
    private final List<ExternalReferenceListener> aListener;

    /** Listeners meant for {@code <area href>} references. */
    private final List<ExternalReferenceListener> areaListener;

    /** Listeners meant for {@code <audio src>} references. */
    private final List<ExternalReferenceListener> audioListener;

    /** Listeners meant for {@code <base href>} references. */
    private final List<ExternalReferenceListener> baseListener;

    /** Listeners meant for {@code <blockquote cite>} references. */
    private final List<ExternalReferenceListener> blockquoteListener;

    /** Listeners meant for {@code <button formaction>} references. */
    private final List<ExternalReferenceListener> buttonListener;

    /** Listeners meant for {@code <del cite>} references. */
    private final List<ExternalReferenceListener> delListener;

    /** Listeners meant for {@code <embed src>} references. */
    private final List<ExternalReferenceListener> embedListener;

    /** Listeners meant for {@code <form action>} references. */
    private final List<ExternalReferenceListener> formListener;

    /** Listeners meant for {@code <iframe src>} references. */
    private final List<ExternalReferenceListener> iframeListener;

    /** Listeners meant for {@code <img src srcset>} references. */
    private final List<ExternalReferenceListener> imgListener;

    /** Listeners meant for {@code <input src>} and {@code <input formaction>} references. */
    private final List<ExternalReferenceListener> inputListener;

    /** Listeners meant for {@code <ins cite>} references. */
    private final List<ExternalReferenceListener> insListener;

    /** Listeners meant for the {@code itemid}, {@code itemprop} and {@code itemtype} attributes of any element. */
    private final List<ExternalReferenceListener> itemListener;

    /** Listeners notified by {@code handleLink} about {@code <link>} references. */
    private final List<ExternalReferenceListener> linkListener;

    /** Listeners meant for {@code <html manifest>} references. */
    private final List<ExternalReferenceListener> manifestListener;

    /** Listeners meant for {@code <object data>} references. */
    private final List<ExternalReferenceListener> objectListener;

    /** Listeners meant for {@code <q cite>} references. */
    private final List<ExternalReferenceListener> qListener;

    /** Listeners meant for {@code <script src>} references. */
    private final List<ExternalReferenceListener> scriptListener;

    /** Listeners meant for {@code <source src srcset>} references. */
    private final List<ExternalReferenceListener> sourceListener;

    /** Listeners meant for {@code <track src>} references. */
    private final List<ExternalReferenceListener> trackListener;

    /** Listeners meant for {@code <video poster src>} references. */
    private final List<ExternalReferenceListener> videoListener;

    /**
     * Creates a crawler that keeps the given listener lists, one per supported HTML construct.
     *
     * <p>The parameters are declared in the same (alphabetical) order as the fields. This is also the
     * order in which {@code BuilderImpl.build()} passes its lists, so every list ends up in the field of
     * the same name. The lists are stored as given; no copies are made here.
     *
     * @param aListener listeners for {@code <a>}
     * @param areaListener listeners for {@code <area>}
     * @param audioListener listeners for {@code <audio>}
     * @param baseListener listeners for {@code <base>}
     * @param blockquoteListener listeners for {@code <blockquote>}
     * @param buttonListener listeners for {@code <button>}
     * @param delListener listeners for {@code <del>}
     * @param embedListener listeners for {@code <embed>}
     * @param formListener listeners for {@code <form>}
     * @param iframeListener listeners for {@code <iframe>}
     * @param imgListener listeners for {@code <img>}
     * @param inputListener listeners for {@code <input>}
     * @param insListener listeners for {@code <ins>}
     * @param itemListener listeners for the {@code item*} attributes
     * @param linkListener listeners for {@code <link>}
     * @param manifestListener listeners for {@code <html manifest>}
     * @param objectListener listeners for {@code <object>}
     * @param qListener listeners for {@code <q>}
     * @param scriptListener listeners for {@code <script>}
     * @param sourceListener listeners for {@code <source>}
     * @param trackListener listeners for {@code <track>}
     * @param videoListener listeners for {@code <video>}
     */
    private SinglePageCrawler(final List<ExternalReferenceListener> aListener,
            final List<ExternalReferenceListener> areaListener,
            final List<ExternalReferenceListener> audioListener,
            final List<ExternalReferenceListener> baseListener,
            final List<ExternalReferenceListener> blockquoteListener,
            final List<ExternalReferenceListener> buttonListener,
            // del must come before embed: the caller passes the lists in field order.
            final List<ExternalReferenceListener> delListener,
            final List<ExternalReferenceListener> embedListener,
            final List<ExternalReferenceListener> formListener,
            final List<ExternalReferenceListener> iframeListener,
            final List<ExternalReferenceListener> imgListener,
            final List<ExternalReferenceListener> inputListener,
            final List<ExternalReferenceListener> insListener,
            final List<ExternalReferenceListener> itemListener,
            final List<ExternalReferenceListener> linkListener,
            final List<ExternalReferenceListener> manifestListener,
            final List<ExternalReferenceListener> objectListener,
            final List<ExternalReferenceListener> qListener,
            final List<ExternalReferenceListener> scriptListener,
            final List<ExternalReferenceListener> sourceListener,
            final List<ExternalReferenceListener> trackListener,
            final List<ExternalReferenceListener> videoListener) {
        this.aListener = aListener;
        this.areaListener = areaListener;
        this.audioListener = audioListener;
        this.baseListener = baseListener;
        this.blockquoteListener = blockquoteListener;
        this.buttonListener = buttonListener;
        this.delListener = delListener;
        this.embedListener = embedListener;
        this.formListener = formListener;
        this.iframeListener = iframeListener;
        this.imgListener = imgListener;
        this.inputListener = inputListener;
        this.insListener = insListener;
        this.itemListener = itemListener;
        this.linkListener = linkListener;
        this.manifestListener = manifestListener;
        this.objectListener = objectListener;
        this.qListener = qListener;
        this.scriptListener = scriptListener;
        this.sourceListener = sourceListener;
        this.trackListener = trackListener;
        this.videoListener = videoListener;
    }

    /**
     * Entry point for configuring a crawler: hands out a new builder without any registered listeners.
     *
     * @return a newly created {@link Builder}
     */
    public static Builder builder() {
        final Builder emptyBuilder = new BuilderImpl();
        return emptyBuilder;
    }

    /**
     * Parses the given HTML and reports the references found in it to the registered listeners.
     *
     * <p>At present only {@code <link>} elements that are direct children of the document head are
     * evaluated; the document body is not inspected yet.
     *
     * @param content the HTML source to parse
     * @param baseUri the URI the content was loaded from; it is handed to the parser as the base URI of
     *     the document. {@code null} is treated like the empty base URI.
     */
    public void parse(String content, String baseUri) {
        // Fall back to the empty base URI (the one Jsoup.parse(String) uses) because the parser is not
        // guaranteed to accept null here.
        Document document = Jsoup.parse(content, baseUri == null ? "" : baseUri);
        for (Element child : document.head().children()) {
            if ("link".equals(child.nodeName())) {
                handleLink(child);
            }
        }
        // TODO: elements of document.body() are not evaluated yet.
    }

    /**
     * Reads the complete content of the reader into memory and parses it as HTML via
     * {@link #parse(String, String)}.
     *
     * <p>The reader is not closed by this method.
     *
     * @param content the source of the HTML text
     * @param baseUri passed on unchanged to {@link #parse(String, String)}
     * @throws IOException if reading from {@code content} fails
     */
    public void parse(Reader content, String baseUri) throws IOException {
        final StringWriter buffer = new StringWriter();
        content.transferTo(buffer);
        final String html = buffer.toString();
        parse(html, baseUri);
    }

    /**
     * Evaluates a {@code <link>} element and notifies the link listeners if it references a stylesheet.
     *
     * <p>The {@code rel} attribute is treated as an unordered set of whitespace separated keywords that
     * are compared case-insensitively. If it contains {@code stylesheet}, exactly one event is sent to
     * every link listener: of type {@code ALTERNATE_STYLESHEET} when {@code alternate} is present as well
     * (in any position), otherwise of type {@code STYLESHEET}. Links without the {@code stylesheet}
     * keyword are ignored.
     *
     * @param link the {@code <link>} element to evaluate
     */
    private void handleLink(Element link) {
        Attributes attributes = link.attributes();
        boolean stylesheet = false;
        boolean alternate = false;
        // Collect the keywords first so that their order in the attribute does not matter.
        for (String rel : attributes.getIgnoreCase("rel").split("\\s+")) {
            if ("stylesheet".equalsIgnoreCase(rel)) {
                stylesheet = true;
            } else if ("alternate".equalsIgnoreCase(rel)) {
                alternate = true;
            }
        }
        if (!stylesheet) {
            return;
        }
        RelationType type = alternate ? RelationType.ALTERNATE_STYLESHEET : RelationType.STYLESHEET;
        ExternalReferenceEvent event = new ExternalReferenceEvent(link.tagName(), LinkType.EXTERNAL_RESOURCE,
                attributes.get("href"), type);
        for (ExternalReferenceListener listener : this.linkListener) {
            listener.link(event);
        }
    }


    /**
     * Builder for {@link SinglePageCrawler} instances.
     *
     * <p>Each {@code add...Listener} method registers one listener for one kind of HTML construct and
     * returns a builder, so that calls can be chained. {@link #build()} finally creates the crawler.
     */
    public interface Builder {

        /**
         * Registers a listener for {@code <a href ping>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addAListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <area href>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addAreaListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <audio src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addAudioListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <base href>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addBaseListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <blockquote cite>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addBlockquoteListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <button formaction>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addButtonListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <del cite>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addDelListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <embed src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addEmbedListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <form action>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addFormListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <iframe src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addIframeListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <img src srcset>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addImgListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <input src>} and
         * {@code <input formaction>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addInputListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <ins cite>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addInsListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for the item attributes of arbitrary elements:
         * {@code <... itemid itemprop itemtype>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addItemListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <link href imagesrcset>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addLinkListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <html manifest>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addManifestListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <object data>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addObjectListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <q cite>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addQListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <script src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addScriptListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <source src srcset>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addSourceListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <track src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addTrackListener(ExternalReferenceListener listener);

        /**
         * Registers a listener for {@code <video poster src>}.
         *
         * @param listener the listener to register
         * @return a builder to continue the configuration with
         */
        Builder addVideoListener(ExternalReferenceListener listener);

        /**
         * Creates a crawler from the listeners registered on this builder.
         *
         * @return the created {@link SinglePageCrawler}
         */
        SinglePageCrawler build();

    }


    /**
     * {@link Builder} implementation that collects the registered listeners in one list per HTML
     * construct and hands copies of these lists to the crawler it builds.
     */
    private static class BuilderImpl implements Builder {

        private final List<ExternalReferenceListener> aListener = new LinkedList<>();

        private final List<ExternalReferenceListener> areaListener = new LinkedList<>();

        private final List<ExternalReferenceListener> audioListener = new LinkedList<>();

        private final List<ExternalReferenceListener> baseListener = new LinkedList<>();

        private final List<ExternalReferenceListener> blockquoteListener = new LinkedList<>();

        private final List<ExternalReferenceListener> buttonListener = new LinkedList<>();

        private final List<ExternalReferenceListener> delListener = new LinkedList<>();

        private final List<ExternalReferenceListener> embedListener = new LinkedList<>();

        private final List<ExternalReferenceListener> formListener = new LinkedList<>();

        private final List<ExternalReferenceListener> iframeListener = new LinkedList<>();

        private final List<ExternalReferenceListener> imgListener = new LinkedList<>();

        private final List<ExternalReferenceListener> inputListener = new LinkedList<>();

        private final List<ExternalReferenceListener> insListener = new LinkedList<>();

        private final List<ExternalReferenceListener> itemListener = new LinkedList<>();

        private final List<ExternalReferenceListener> linkListener = new LinkedList<>();

        private final List<ExternalReferenceListener> manifestListener = new LinkedList<>();

        private final List<ExternalReferenceListener> objectListener = new LinkedList<>();

        private final List<ExternalReferenceListener> qListener = new LinkedList<>();

        private final List<ExternalReferenceListener> scriptListener = new LinkedList<>();

        private final List<ExternalReferenceListener> sourceListener = new LinkedList<>();

        private final List<ExternalReferenceListener> trackListener = new LinkedList<>();

        private final List<ExternalReferenceListener> videoListener = new LinkedList<>();

        /** Instances are only handed out by {@link SinglePageCrawler#builder()}. */
        private BuilderImpl() {}

        @Override
        public Builder addAListener(final ExternalReferenceListener listener) {
            this.aListener.add(listener);
            return this;
        }

        @Override
        public Builder addAreaListener(final ExternalReferenceListener listener) {
            this.areaListener.add(listener);
            return this;
        }

        @Override
        public Builder addAudioListener(final ExternalReferenceListener listener) {
            this.audioListener.add(listener);
            return this;
        }

        @Override
        public Builder addBaseListener(final ExternalReferenceListener listener) {
            this.baseListener.add(listener);
            return this;
        }

        @Override
        public Builder addBlockquoteListener(final ExternalReferenceListener listener) {
            this.blockquoteListener.add(listener);
            return this;
        }

        @Override
        public Builder addButtonListener(final ExternalReferenceListener listener) {
            this.buttonListener.add(listener);
            return this;
        }

        @Override
        public Builder addDelListener(final ExternalReferenceListener listener) {
            this.delListener.add(listener);
            return this;
        }

        @Override
        public Builder addEmbedListener(final ExternalReferenceListener listener) {
            this.embedListener.add(listener);
            return this;
        }

        @Override
        public Builder addFormListener(final ExternalReferenceListener listener) {
            this.formListener.add(listener);
            return this;
        }

        @Override
        public Builder addIframeListener(final ExternalReferenceListener listener) {
            this.iframeListener.add(listener);
            return this;
        }

        @Override
        public Builder addImgListener(final ExternalReferenceListener listener) {
            this.imgListener.add(listener);
            return this;
        }

        @Override
        public Builder addInputListener(final ExternalReferenceListener listener) {
            this.inputListener.add(listener);
            return this;
        }

        @Override
        public Builder addInsListener(final ExternalReferenceListener listener) {
            this.insListener.add(listener);
            return this;
        }

        @Override
        public Builder addItemListener(final ExternalReferenceListener listener) {
            this.itemListener.add(listener);
            return this;
        }

        @Override
        public Builder addLinkListener(final ExternalReferenceListener listener) {
            this.linkListener.add(listener);
            return this;
        }

        @Override
        public Builder addManifestListener(final ExternalReferenceListener listener) {
            this.manifestListener.add(listener);
            return this;
        }

        @Override
        public Builder addObjectListener(final ExternalReferenceListener listener) {
            this.objectListener.add(listener);
            return this;
        }

        @Override
        public Builder addQListener(final ExternalReferenceListener listener) {
            this.qListener.add(listener);
            return this;
        }

        @Override
        public Builder addScriptListener(final ExternalReferenceListener listener) {
            this.scriptListener.add(listener);
            return this;
        }

        @Override
        public Builder addSourceListener(final ExternalReferenceListener listener) {
            this.sourceListener.add(listener);
            return this;
        }

        @Override
        public Builder addTrackListener(final ExternalReferenceListener listener) {
            this.trackListener.add(listener);
            return this;
        }

        @Override
        public Builder addVideoListener(final ExternalReferenceListener listener) {
            this.videoListener.add(listener);
            return this;
        }

        /**
         * Creates a crawler from the listeners registered so far.
         *
         * <p>The crawler receives copies of the listener lists, so listeners added to this builder after
         * the call do not affect crawlers that were built before. The lists are passed in the
         * alphabetical order in which the fields are declared.
         *
         * @return the created {@link SinglePageCrawler}
         */
        @Override
        public SinglePageCrawler build() {
            return new SinglePageCrawler(snapshot(this.aListener), snapshot(this.areaListener),
                    snapshot(this.audioListener), snapshot(this.baseListener), snapshot(this.blockquoteListener),
                    snapshot(this.buttonListener), snapshot(this.delListener), snapshot(this.embedListener),
                    snapshot(this.formListener), snapshot(this.iframeListener), snapshot(this.imgListener),
                    snapshot(this.inputListener), snapshot(this.insListener), snapshot(this.itemListener),
                    snapshot(this.linkListener), snapshot(this.manifestListener), snapshot(this.objectListener),
                    snapshot(this.qListener), snapshot(this.scriptListener), snapshot(this.sourceListener),
                    snapshot(this.trackListener), snapshot(this.videoListener));
        }

        /** Returns an independent copy of the given listener list, keeping the registration order. */
        private static List<ExternalReferenceListener> snapshot(final List<ExternalReferenceListener> listeners) {
            return new LinkedList<>(listeners);
        }

    }

}