View Javadoc
1   /*
2    * html-crawler is a library to help crawling websites.
3    * Copyright (C) 2020  Uwe Plonus
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU Affero General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Affero General Public License for more details.
14   *
15   * You should have received a copy of the GNU Affero General Public License
16   * along with this program.  If not, see <https://www.gnu.org/licenses/>.
17   */
18  
19  package org.sw4j.tool.html.crawler;
20  
21  import java.io.IOException;
22  import java.io.Reader;
23  import java.io.StringWriter;
24  import java.util.LinkedList;
25  import java.util.List;
26  import org.jsoup.Jsoup;
27  import org.jsoup.nodes.Attributes;
28  import org.jsoup.nodes.Document;
29  import org.jsoup.nodes.Element;
30  
31  public class SinglePageCrawler {
32  
33      private final List<ExternalReferenceListener> aListener;
34  
35      private final List<ExternalReferenceListener> areaListener;
36  
37      private final List<ExternalReferenceListener> audioListener;
38  
39      private final List<ExternalReferenceListener> baseListener;
40  
41      private final List<ExternalReferenceListener> blockquoteListener;
42  
43      private final List<ExternalReferenceListener> buttonListener;
44  
45      private final List<ExternalReferenceListener> delListener;
46  
47      private final List<ExternalReferenceListener> embedListener;
48  
49      private final List<ExternalReferenceListener> formListener;
50  
51      private final List<ExternalReferenceListener> iframeListener;
52  
53      private final List<ExternalReferenceListener> imgListener;
54  
55      private final List<ExternalReferenceListener> inputListener;
56  
57      private final List<ExternalReferenceListener> insListener;
58  
59      private final List<ExternalReferenceListener> itemListener;
60  
61      private final List<ExternalReferenceListener> linkListener;
62  
63      private final List<ExternalReferenceListener> manifestListener;
64  
65      private final List<ExternalReferenceListener> objectListener;
66  
67      private final List<ExternalReferenceListener> qListener;
68  
69      private final List<ExternalReferenceListener> scriptListener;
70  
71      private final List<ExternalReferenceListener> sourceListener;
72  
73      private final List<ExternalReferenceListener> trackListener;
74  
75      private final List<ExternalReferenceListener> videoListener;
76  
77      private SinglePageCrawler(final List<ExternalReferenceListener> aListener,
78              final List<ExternalReferenceListener> areaListener,
79              final List<ExternalReferenceListener> audioListener,
80              final List<ExternalReferenceListener> baseListener,
81              final List<ExternalReferenceListener> blockquoteListener,
82              final List<ExternalReferenceListener> buttonListener,
83              final List<ExternalReferenceListener> embedListener,
84              final List<ExternalReferenceListener> delListener,
85              final List<ExternalReferenceListener> formListener,
86              final List<ExternalReferenceListener> iframeListener,
87              final List<ExternalReferenceListener> imgListener,
88              final List<ExternalReferenceListener> inputListener,
89              final List<ExternalReferenceListener> insListener,
90              final List<ExternalReferenceListener> itemListener,
91              final List<ExternalReferenceListener> linkListener,
92              final List<ExternalReferenceListener> manifestListener,
93              final List<ExternalReferenceListener> objectListener,
94              final List<ExternalReferenceListener> qListener,
95              final List<ExternalReferenceListener> scriptListener,
96              final List<ExternalReferenceListener> sourceListener,
97              final List<ExternalReferenceListener> trackListener,
98              final List<ExternalReferenceListener> videoListener) {
99          this.aListener = aListener;
100         this.areaListener = areaListener;
101         this.audioListener = audioListener;
102         this.baseListener = baseListener;
103         this.blockquoteListener = blockquoteListener;
104         this.buttonListener = buttonListener;
105         this.delListener = delListener;
106         this.embedListener = embedListener;
107         this.formListener = formListener;
108         this.iframeListener = iframeListener;
109         this.imgListener = imgListener;
110         this.inputListener = inputListener;
111         this.insListener = insListener;
112         this.itemListener = itemListener;
113         this.linkListener = linkListener;
114         this.manifestListener = manifestListener;
115         this.objectListener = objectListener;
116         this.qListener = qListener;
117         this.scriptListener = scriptListener;
118         this.sourceListener = sourceListener;
119         this.trackListener = trackListener;
120         this.videoListener = videoListener;
121     }
122 
123     public static Builder builder() {
124         return new BuilderImpl();
125     }
126 
127     public void parse(String content, String baseUri) {
128         Document document = Jsoup.parse(content);
129         for (Element child: document.head().children()) {
130             if ("link".equals(child.nodeName())) {
131                 handleLink(child);
132             }
133         }
134         for (Element child: document.body().children()) {
135         }
136     }
137 
138     public void parse(Reader content, String baseUri) throws IOException {
139         StringWriter stringWriter = new StringWriter();
140         content.transferTo(stringWriter);
141         parse(stringWriter.toString(), baseUri);
142     }
143 
144     private void handleLink(Element link) {
145         Attributes attributes = link.attributes();
146         String[] rels = attributes.getIgnoreCase("rel").split("\\s");
147         boolean alternate = false;
148         for (String rel: rels) {
149             switch (rel) {
150                 case "alternate":
151                     alternate = true;
152                     break;
153                 case "stylesheet":
154                     String href = attributes.get("href");
155                     RelationType type = RelationType.STYLESHEET;
156                     if (alternate) {
157                         type = RelationType.ALTERNATE_STYLESHEET;
158                         alternate = false;
159                     }
160                     ExternalReferenceEventenceEvent.html#ExternalReferenceEvent">ExternalReferenceEvent event = new ExternalReferenceEvent(link.tagName(), LinkType.EXTERNAL_RESOURCE,
161                             href, type);
162                     for (ExternalReferenceListener listener : this.linkListener) {
163                         listener.link(event);
164                     }
165                     break;
166                 default:
167                     break;
168             }
169         }
170     }
171 
172 
173     public interface Builder {
174 
175         /**
176          * &lt;a href ping&gt;
177          *
178          * @param listener
179          * @return
180          */
181         Builder addAListener(ExternalReferenceListener listener);
182 
183         /**
184          * &lt;area href&gt;
185          *
186          * @param listener
187          * @return
188          */
189         Builder addAreaListener(ExternalReferenceListener listener);
190 
191         /**
192          * &lt;audio src&gt;
193          *
194          * @param listener
195          * @return
196          */
197         Builder addAudioListener(ExternalReferenceListener listener);
198 
199         /**
200          * &lt;base href&gt;
201          *
202          * @param listener
203          * @return
204          */
205         Builder addBaseListener(ExternalReferenceListener listener);
206 
207         /**
208          * &lt;blockquote cite&gt;
209          *
210          * @param listener
211          * @return
212          */
213         Builder addBlockquoteListener(ExternalReferenceListener listener);
214 
215         /**
216          * &lt;button formaction&gt;
217          *
218          * @param listener
219          * @return
220          */
221         Builder addButtonListener(ExternalReferenceListener listener);
222 
223         /**
224          * &lt;del cite&gt;
225          *
226          * @param listener
227          * @return
228          */
229         Builder addDelListener(ExternalReferenceListener listener);
230 
231         /**
232          * &lt;embed src&gt;
233          *
234          * @param listener
235          * @return
236          */
237         Builder addEmbedListener(ExternalReferenceListener listener);
238 
239         /**
240          * &lt;form action&gt;
241          *
242          * @param listener
243          * @return
244          */
245         Builder addFormListener(ExternalReferenceListener listener);
246 
247         /**
248          * &lt;iframe src&gt;
249          *
250          * @param listener
251          * @return
252          */
253         Builder addIframeListener(ExternalReferenceListener listener);
254 
255         /**
256          * &lt;img src srcset&gt;
257          *
258          * @param listener
259          * @return
260          */
261         Builder addImgListener(ExternalReferenceListener listener);
262 
263         /**
264          * &lt;input src&gt;
265          * &lt;input formaction&gt;
266          *
267          * @param listener
268          * @return
269          */
270         Builder addInputListener(ExternalReferenceListener listener);
271 
272         /**
273          * &lt;ins cite&gt;
274          *
275          * @param listener
276          * @return
277          */
278         Builder addInsListener(ExternalReferenceListener listener);
279 
280         /**
281          *
282          * &lt;... itemid itemprop itemtype&gt;
283          *
284          * @param listener
285          * @return
286          */
287         Builder addItemListener(ExternalReferenceListener listener);
288 
289         /**
290          * &lt;link href imagesrcset&gt;
291          *
292          * @param listener
293          * @return
294          */
295         Builder addLinkListener(ExternalReferenceListener listener);
296 
297         /**
298          * &lt;html manifest&gt;
299          *
300          * @param listener
301          * @return
302          */
303         Builder addManifestListener(ExternalReferenceListener listener);
304 
305         /**
306          * &lt;object data&gt;
307          *
308          * @param listener
309          * @return
310          */
311         Builder addObjectListener(ExternalReferenceListener listener);
312 
313         /**
314          * &lt;q cite&gt;
315          *
316          * @param listener
317          * @return
318          */
319         Builder addQListener(ExternalReferenceListener listener);
320 
321         /**
322          * &lt;script src&gt;
323          *
324          * @param listener
325          * @return
326          */
327         Builder addScriptListener(ExternalReferenceListener listener);
328 
329         /**
330          * &lt;source src srcset&gt;
331          *
332          * @param listener
333          * @return
334          */
335         Builder addSourceListener(ExternalReferenceListener listener);
336 
337         /**
338          * &lt;track src&gt;
339          *
340          * @param listener
341          * @return
342          */
343         Builder addTrackListener(ExternalReferenceListener listener);
344 
345         /**
346          * &lt;video poster src&gt;
347          *
348          * @param listener
349          * @return
350          */
351         Builder addVideoListener(ExternalReferenceListener listener);
352 
353         SinglePageCrawler build();
354 
355     }
356 
357 
358     private static class BuilderImpl implements Builder {
359 
360         private final List<ExternalReferenceListener> aListener = new LinkedList<>();
361 
362         private final List<ExternalReferenceListener> areaListener = new LinkedList<>();
363 
364         private final List<ExternalReferenceListener> audioListener = new LinkedList<>();
365 
366         private final List<ExternalReferenceListener> baseListener = new LinkedList<>();
367 
368         private final List<ExternalReferenceListener> blockquoteListener = new LinkedList<>();
369 
370         private final List<ExternalReferenceListener> buttonListener = new LinkedList<>();
371 
372         private final List<ExternalReferenceListener> delListener = new LinkedList<>();
373 
374         private final List<ExternalReferenceListener> embedListener = new LinkedList<>();
375 
376         private final List<ExternalReferenceListener> formListener = new LinkedList<>();
377 
378         private final List<ExternalReferenceListener> iframeListener = new LinkedList<>();
379 
380         private final List<ExternalReferenceListener> imgListener = new LinkedList<>();
381 
382         private final List<ExternalReferenceListener> inputListener = new LinkedList<>();
383 
384         private final List<ExternalReferenceListener> insListener = new LinkedList<>();
385 
386         private final List<ExternalReferenceListener> itemListener = new LinkedList<>();
387 
388         private final List<ExternalReferenceListener> linkListener = new LinkedList<>();
389 
390         private final List<ExternalReferenceListener> manifestListener = new LinkedList<>();
391 
392         private final List<ExternalReferenceListener> objectListener = new LinkedList<>();
393 
394         private final List<ExternalReferenceListener> qListener = new LinkedList<>();
395 
396         private final List<ExternalReferenceListener> scriptListener = new LinkedList<>();
397 
398         private final List<ExternalReferenceListener> sourceListener = new LinkedList<>();
399 
400         private final List<ExternalReferenceListener> trackListener = new LinkedList<>();
401 
402         private final List<ExternalReferenceListener> videoListener = new LinkedList<>();
403 
404         private BuilderImpl() {}
405 
406         @Override
407         public Builder addAListener(final ExternalReferenceListener listener) {
408             this.aListener.add(listener);
409             return this;
410         }
411 
412         @Override
413         public Builder addAreaListener(final ExternalReferenceListener listener) {
414             this.areaListener.add(listener);
415             return this;
416         }
417 
418         @Override
419         public Builder addAudioListener(final ExternalReferenceListener listener) {
420             this.audioListener.add(listener);
421             return this;
422         }
423 
424         @Override
425         public Builder addBaseListener(final ExternalReferenceListener listener) {
426             this.baseListener.add(listener);
427             return this;
428         }
429 
430         @Override
431         public Builder addBlockquoteListener(final ExternalReferenceListener listener) {
432             this.blockquoteListener.add(listener);
433             return this;
434         }
435 
436         @Override
437         public Builder addButtonListener(final ExternalReferenceListener listener) {
438             this.buttonListener.add(listener);
439             return this;
440         }
441 
442         @Override
443         public Builder addDelListener(final ExternalReferenceListener listener) {
444             this.delListener.add(listener);
445             return this;
446         }
447 
448         @Override
449         public Builder addEmbedListener(final ExternalReferenceListener listener) {
450             this.embedListener.add(listener);
451             return this;
452         }
453 
454         @Override
455         public Builder addIframeListener(final ExternalReferenceListener listener) {
456             this.iframeListener.add(listener);
457             return this;
458         }
459 
460         @Override
461         public Builder addImgListener(final ExternalReferenceListener listener) {
462             this.imgListener.add(listener);
463             return this;
464         }
465 
466         @Override
467         public Builder addInputListener(final ExternalReferenceListener listener) {
468             this.inputListener.add(listener);
469             return this;
470         }
471 
472         @Override
473         public Builder addInsListener(final ExternalReferenceListener listener) {
474             this.insListener.add(listener);
475             return this;
476         }
477 
478         @Override
479         public Builder addItemListener(final ExternalReferenceListener listener) {
480             this.itemListener.add(listener);
481             return this;
482         }
483 
484         @Override
485         public Builder addLinkListener(final ExternalReferenceListener listener) {
486             this.linkListener.add(listener);
487             return this;
488         }
489 
490         @Override
491         public Builder addManifestListener(final ExternalReferenceListener listener) {
492             this.manifestListener.add(listener);
493             return this;
494         }
495 
496         @Override
497         public Builder addObjectListener(final ExternalReferenceListener listener) {
498             this.objectListener.add(listener);
499             return this;
500         }
501 
502         @Override
503         public Builder addQListener(final ExternalReferenceListener listener) {
504             this.qListener.add(listener);
505             return this;
506         }
507 
508         @Override
509         public Builder addScriptListener(final ExternalReferenceListener listener) {
510             this.scriptListener.add(listener);
511             return this;
512         }
513 
514         @Override
515         public Builder addSourceListener(final ExternalReferenceListener listener) {
516             this.sourceListener.add(listener);
517             return this;
518         }
519 
520         @Override
521         public Builder addTrackListener(final ExternalReferenceListener listener) {
522             this.trackListener.add(listener);
523             return this;
524         }
525 
526         @Override
527         public Builder addVideoListener(final ExternalReferenceListener listener) {
528             this.videoListener.add(listener);
529             return this;
530         }
531 
532         @Override
533         public Builder addFormListener(final ExternalReferenceListener listener) {
534             this.formListener.add(listener);
535             return this;
536         }
537 
538         public SinglePageCrawler build() {
539             return new SinglePageCrawler(this.aListener, this.areaListener, this.audioListener, this.baseListener,
540                     this.blockquoteListener, this.buttonListener, this.delListener, this.embedListener,
541                     this.formListener, this.iframeListener, this.imgListener, this.inputListener, this.insListener,
542                     this.itemListener, this.linkListener, this.manifestListener, this.objectListener, this.qListener,
543                     this.scriptListener, this.sourceListener, this.trackListener, this.videoListener);
544         }
545 
546     }
547 
548 }