SinglePageCrawler.java
/*
* html-crawler is a library to help crawling websites.
* Copyright (C) 2020 Uwe Plonus
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.sw4j.tool.html.crawler;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Parses a single HTML page and reports the external references found in it to registered
 * {@link ExternalReferenceListener}s.
 *
 * <p>Instances are created through {@link #builder()}. Listeners can be registered per element type,
 * but at present only the listeners registered via
 * {@link Builder#addLinkListener(ExternalReferenceListener)} are notified, and only for
 * {@code <link>} elements that are direct children of {@code <head>} and whose {@code rel}
 * attribute contains {@code stylesheet}. All other listener lists are stored but not used yet.</p>
 */
public class SinglePageCrawler {

    /** Listeners for {@code <a>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> aListener;

    /** Listeners for {@code <area>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> areaListener;

    /** Listeners for {@code <audio>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> audioListener;

    /** Listeners for {@code <base>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> baseListener;

    /** Listeners for {@code <blockquote>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> blockquoteListener;

    /** Listeners for {@code <button>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> buttonListener;

    /** Listeners for {@code <del>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> delListener;

    /** Listeners for {@code <embed>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> embedListener;

    /** Listeners for {@code <form>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> formListener;

    /** Listeners for {@code <iframe>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> iframeListener;

    /** Listeners for {@code <img>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> imgListener;

    /** Listeners for {@code <input>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> inputListener;

    /** Listeners for {@code <ins>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> insListener;

    /** Listeners for the {@code itemid}, {@code itemprop} and {@code itemtype} attributes (stored, not notified yet). */
    private final List<ExternalReferenceListener> itemListener;

    /** Listeners for {@code <link>} elements; notified by {@link #handleLink(Element)}. */
    private final List<ExternalReferenceListener> linkListener;

    /** Listeners for the {@code manifest} attribute of {@code <html>} (stored, not notified yet). */
    private final List<ExternalReferenceListener> manifestListener;

    /** Listeners for {@code <object>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> objectListener;

    /** Listeners for {@code <q>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> qListener;

    /** Listeners for {@code <script>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> scriptListener;

    /** Listeners for {@code <source>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> sourceListener;

    /** Listeners for {@code <track>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> trackListener;

    /** Listeners for {@code <video>} elements (stored, not notified yet). */
    private final List<ExternalReferenceListener> videoListener;

    /**
     * Creates a crawler holding a snapshot of the given listener lists.
     *
     * <p>The parameters are declared in the same (alphabetical) order as the fields and as the
     * argument list in {@link BuilderImpl#build()}, so that each list ends up in the field of the
     * same name. Every list is copied, so later changes to the builder do not affect this
     * instance.</p>
     */
    private SinglePageCrawler(final List<ExternalReferenceListener> aListener,
            final List<ExternalReferenceListener> areaListener,
            final List<ExternalReferenceListener> audioListener,
            final List<ExternalReferenceListener> baseListener,
            final List<ExternalReferenceListener> blockquoteListener,
            final List<ExternalReferenceListener> buttonListener,
            final List<ExternalReferenceListener> delListener,
            final List<ExternalReferenceListener> embedListener,
            final List<ExternalReferenceListener> formListener,
            final List<ExternalReferenceListener> iframeListener,
            final List<ExternalReferenceListener> imgListener,
            final List<ExternalReferenceListener> inputListener,
            final List<ExternalReferenceListener> insListener,
            final List<ExternalReferenceListener> itemListener,
            final List<ExternalReferenceListener> linkListener,
            final List<ExternalReferenceListener> manifestListener,
            final List<ExternalReferenceListener> objectListener,
            final List<ExternalReferenceListener> qListener,
            final List<ExternalReferenceListener> scriptListener,
            final List<ExternalReferenceListener> sourceListener,
            final List<ExternalReferenceListener> trackListener,
            final List<ExternalReferenceListener> videoListener) {
        this.aListener = new LinkedList<>(aListener);
        this.areaListener = new LinkedList<>(areaListener);
        this.audioListener = new LinkedList<>(audioListener);
        this.baseListener = new LinkedList<>(baseListener);
        this.blockquoteListener = new LinkedList<>(blockquoteListener);
        this.buttonListener = new LinkedList<>(buttonListener);
        this.delListener = new LinkedList<>(delListener);
        this.embedListener = new LinkedList<>(embedListener);
        this.formListener = new LinkedList<>(formListener);
        this.iframeListener = new LinkedList<>(iframeListener);
        this.imgListener = new LinkedList<>(imgListener);
        this.inputListener = new LinkedList<>(inputListener);
        this.insListener = new LinkedList<>(insListener);
        this.itemListener = new LinkedList<>(itemListener);
        this.linkListener = new LinkedList<>(linkListener);
        this.manifestListener = new LinkedList<>(manifestListener);
        this.objectListener = new LinkedList<>(objectListener);
        this.qListener = new LinkedList<>(qListener);
        this.scriptListener = new LinkedList<>(scriptListener);
        this.sourceListener = new LinkedList<>(sourceListener);
        this.trackListener = new LinkedList<>(trackListener);
        this.videoListener = new LinkedList<>(videoListener);
    }

    /**
     * Creates a new, empty builder for a {@code SinglePageCrawler}.
     *
     * @return a fresh builder without any registered listeners
     */
    public static Builder builder() {
        return new BuilderImpl();
    }

    /**
     * Parses the given HTML and notifies the registered listeners about the references found.
     *
     * <p>Only {@code <link>} elements that are direct children of {@code <head>} are inspected at
     * the moment.</p>
     *
     * @param content the HTML source of the page
     * @param baseUri the base URI of the page; NOTE(review): currently not used, the {@code href}
     *        values are reported exactly as they appear in the document
     */
    public void parse(String content, String baseUri) {
        Document document = Jsoup.parse(content);
        for (Element child : document.head().children()) {
            if ("link".equals(child.nodeName())) {
                handleLink(child);
            }
        }
        // TODO: elements inside <body> (a, img, script, ...) are not processed yet.
    }

    /**
     * Reads the complete content of the reader and parses it like {@link #parse(String, String)}.
     *
     * <p>The reader is read until its end but is not closed by this method.</p>
     *
     * @param content the reader providing the HTML source of the page
     * @param baseUri the base URI of the page, passed on to {@link #parse(String, String)}
     * @throws IOException if reading from {@code content} fails
     */
    public void parse(Reader content, String baseUri) throws IOException {
        StringWriter stringWriter = new StringWriter();
        content.transferTo(stringWriter);
        parse(stringWriter.toString(), baseUri);
    }

    /**
     * Reports a {@code <link>} element to the link listeners if it references a stylesheet.
     *
     * <p>The {@code rel} attribute is treated as an unordered set of whitespace separated,
     * case-insensitive tokens: the element is reported once if the set contains
     * {@code stylesheet}, with relation type {@code ALTERNATE_STYLESHEET} if it also contains
     * {@code alternate} (regardless of token order) and {@code STYLESHEET} otherwise. Links
     * without the {@code stylesheet} token are ignored.</p>
     *
     * @param link the {@code <link>} element to inspect
     */
    private void handleLink(Element link) {
        Attributes attributes = link.attributes();
        boolean alternate = false;
        boolean stylesheet = false;
        // Collect the relevant tokens first so that the result does not depend on their order.
        for (String rel : attributes.getIgnoreCase("rel").split("\\s+")) {
            if ("alternate".equalsIgnoreCase(rel)) {
                alternate = true;
            } else if ("stylesheet".equalsIgnoreCase(rel)) {
                stylesheet = true;
            }
        }
        if (!stylesheet) {
            return;
        }
        RelationType type = alternate ? RelationType.ALTERNATE_STYLESHEET : RelationType.STYLESHEET;
        String href = attributes.get("href");
        ExternalReferenceEvent event = new ExternalReferenceEvent(link.tagName(), LinkType.EXTERNAL_RESOURCE,
                href, type);
        for (ExternalReferenceListener listener : this.linkListener) {
            listener.link(event);
        }
    }

    /**
     * Builder for {@link SinglePageCrawler} instances.
     *
     * <p>Each {@code addXxxListener} method registers a listener for one kind of element or
     * attribute. Note that {@link SinglePageCrawler#parse(String, String)} currently only notifies
     * listeners registered through {@link #addLinkListener(ExternalReferenceListener)}.</p>
     */
    public interface Builder {

        /**
         * Adds a listener for {@code <a href ping>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addAListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <area href>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addAreaListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <audio src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addAudioListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <base href>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addBaseListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <blockquote cite>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addBlockquoteListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <button formaction>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addButtonListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <del cite>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addDelListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <embed src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addEmbedListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <form action>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addFormListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <iframe src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addIframeListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <img src srcset>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addImgListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <input src>} and {@code <input formaction>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addInputListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <ins cite>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addInsListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for the {@code itemid}, {@code itemprop} and {@code itemtype} attributes
         * ({@code <... itemid itemprop itemtype>}).
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addItemListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <link href imagesrcset>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addLinkListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <html manifest>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addManifestListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <object data>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addObjectListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <q cite>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addQListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <script src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addScriptListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <source src srcset>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addSourceListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <track src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addTrackListener(ExternalReferenceListener listener);

        /**
         * Adds a listener for {@code <video poster src>}.
         *
         * @param listener the listener to register
         * @return this builder, to allow chaining
         */
        Builder addVideoListener(ExternalReferenceListener listener);

        /**
         * Creates a crawler with the listeners registered so far.
         *
         * @return a new {@code SinglePageCrawler}
         */
        SinglePageCrawler build();

    }

    /**
     * Default {@link Builder} implementation that collects the listeners in one list per element
     * type and hands them to the {@link SinglePageCrawler} constructor.
     */
    private static class BuilderImpl implements Builder {

        private final List<ExternalReferenceListener> aListener = new LinkedList<>();
        private final List<ExternalReferenceListener> areaListener = new LinkedList<>();
        private final List<ExternalReferenceListener> audioListener = new LinkedList<>();
        private final List<ExternalReferenceListener> baseListener = new LinkedList<>();
        private final List<ExternalReferenceListener> blockquoteListener = new LinkedList<>();
        private final List<ExternalReferenceListener> buttonListener = new LinkedList<>();
        private final List<ExternalReferenceListener> delListener = new LinkedList<>();
        private final List<ExternalReferenceListener> embedListener = new LinkedList<>();
        private final List<ExternalReferenceListener> formListener = new LinkedList<>();
        private final List<ExternalReferenceListener> iframeListener = new LinkedList<>();
        private final List<ExternalReferenceListener> imgListener = new LinkedList<>();
        private final List<ExternalReferenceListener> inputListener = new LinkedList<>();
        private final List<ExternalReferenceListener> insListener = new LinkedList<>();
        private final List<ExternalReferenceListener> itemListener = new LinkedList<>();
        private final List<ExternalReferenceListener> linkListener = new LinkedList<>();
        private final List<ExternalReferenceListener> manifestListener = new LinkedList<>();
        private final List<ExternalReferenceListener> objectListener = new LinkedList<>();
        private final List<ExternalReferenceListener> qListener = new LinkedList<>();
        private final List<ExternalReferenceListener> scriptListener = new LinkedList<>();
        private final List<ExternalReferenceListener> sourceListener = new LinkedList<>();
        private final List<ExternalReferenceListener> trackListener = new LinkedList<>();
        private final List<ExternalReferenceListener> videoListener = new LinkedList<>();

        /** Instances are only created through {@link SinglePageCrawler#builder()}. */
        private BuilderImpl() {}

        @Override
        public Builder addAListener(final ExternalReferenceListener listener) {
            this.aListener.add(listener);
            return this;
        }

        @Override
        public Builder addAreaListener(final ExternalReferenceListener listener) {
            this.areaListener.add(listener);
            return this;
        }

        @Override
        public Builder addAudioListener(final ExternalReferenceListener listener) {
            this.audioListener.add(listener);
            return this;
        }

        @Override
        public Builder addBaseListener(final ExternalReferenceListener listener) {
            this.baseListener.add(listener);
            return this;
        }

        @Override
        public Builder addBlockquoteListener(final ExternalReferenceListener listener) {
            this.blockquoteListener.add(listener);
            return this;
        }

        @Override
        public Builder addButtonListener(final ExternalReferenceListener listener) {
            this.buttonListener.add(listener);
            return this;
        }

        @Override
        public Builder addDelListener(final ExternalReferenceListener listener) {
            this.delListener.add(listener);
            return this;
        }

        @Override
        public Builder addEmbedListener(final ExternalReferenceListener listener) {
            this.embedListener.add(listener);
            return this;
        }

        @Override
        public Builder addFormListener(final ExternalReferenceListener listener) {
            this.formListener.add(listener);
            return this;
        }

        @Override
        public Builder addIframeListener(final ExternalReferenceListener listener) {
            this.iframeListener.add(listener);
            return this;
        }

        @Override
        public Builder addImgListener(final ExternalReferenceListener listener) {
            this.imgListener.add(listener);
            return this;
        }

        @Override
        public Builder addInputListener(final ExternalReferenceListener listener) {
            this.inputListener.add(listener);
            return this;
        }

        @Override
        public Builder addInsListener(final ExternalReferenceListener listener) {
            this.insListener.add(listener);
            return this;
        }

        @Override
        public Builder addItemListener(final ExternalReferenceListener listener) {
            this.itemListener.add(listener);
            return this;
        }

        @Override
        public Builder addLinkListener(final ExternalReferenceListener listener) {
            this.linkListener.add(listener);
            return this;
        }

        @Override
        public Builder addManifestListener(final ExternalReferenceListener listener) {
            this.manifestListener.add(listener);
            return this;
        }

        @Override
        public Builder addObjectListener(final ExternalReferenceListener listener) {
            this.objectListener.add(listener);
            return this;
        }

        @Override
        public Builder addQListener(final ExternalReferenceListener listener) {
            this.qListener.add(listener);
            return this;
        }

        @Override
        public Builder addScriptListener(final ExternalReferenceListener listener) {
            this.scriptListener.add(listener);
            return this;
        }

        @Override
        public Builder addSourceListener(final ExternalReferenceListener listener) {
            this.sourceListener.add(listener);
            return this;
        }

        @Override
        public Builder addTrackListener(final ExternalReferenceListener listener) {
            this.trackListener.add(listener);
            return this;
        }

        @Override
        public Builder addVideoListener(final ExternalReferenceListener listener) {
            this.videoListener.add(listener);
            return this;
        }

        /**
         * Creates the crawler. The arguments are passed in the same order as the constructor
         * declares its parameters (alphabetical by element name).
         */
        @Override
        public SinglePageCrawler build() {
            return new SinglePageCrawler(this.aListener, this.areaListener, this.audioListener, this.baseListener,
                    this.blockquoteListener, this.buttonListener, this.delListener, this.embedListener,
                    this.formListener, this.iframeListener, this.imgListener, this.inputListener, this.insListener,
                    this.itemListener, this.linkListener, this.manifestListener, this.objectListener, this.qListener,
                    this.scriptListener, this.sourceListener, this.trackListener, this.videoListener);
        }

    }

}