BrightSide Workbench Full Report + Source Code
CrawlerOptions.java
Go to the documentation of this file.
1 /*
2  * Turró i Cutiller Foundation. License notice.
3  * Copyright (C) 2022 Lluis Turró Cutiller <http://www.turro.org/>
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Affero General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Affero General Public License for more details.
14  *
15  * You should have received a copy of the GNU Affero General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 package org.turro.elephant.servlet.crawler;
20 
21 import java.io.IOException;
22 import java.util.HashSet;
23 import java.util.Map;
24 import java.util.Set;
25 import java.util.function.Consumer;
26 import java.util.logging.Level;
27 import java.util.logging.Logger;
28 import org.turro.elephant.context.ElephantContext;
29 import org.turro.file.Document;
30 import org.turro.http.activity.ActivityAction;
31 import org.turro.http.activity.IActivityOptions;
32 import org.turro.json.IJSONizable;
33 import org.turro.lock.Executable;
34 import org.turro.string.Strings;
35 
40 public class CrawlerOptions implements IActivityOptions, IJSONizable {
41 
42  private boolean active;
43 
44  private int maxMinutes;
45  private int maxRequests;
46 
47  private final Set<String> badRequest = new HashSet<>();
48  private final Set<String> badAgent = new HashSet<>();
49  private final Set<String> allowedDomains = new HashSet<>();
50 
51  private transient AllowedSet allowedSet;
52 
53  // To remove in future releases
54  private Set<String> whiteList;
55  private Set<String> blackList;
56 
57  @Override
58  public boolean isActive() {
59  return active;
60  }
61 
62  @Override
63  public int maxMinutes() {
64  return maxMinutes;
65  }
66 
67  @Override
68  public int maxRequests() {
69  return maxRequests;
70  }
71 
72  @Override
73  public boolean isWhiteListed(String ip) {
74  return allowedSet != null && allowedSet.isWhiteListed(ip);
75  }
76 
77  @Override
78  public boolean isBlackListed(String ip) {
79  return allowedSet != null && allowedSet.isBlackListed(ip);
80  }
81 
82  @Override
83  public boolean isBadRequest(String request) {
84  return request == null || badRequest.stream().anyMatch(br -> request.matches(br));
85  }
86 
87  @Override
88  public boolean isBadAgent(String agent) {
89  return agent == null || badAgent.stream().anyMatch(br -> agent.matches(br));
90  }
91 
92  @Override
93  public boolean isAllowedDomain(String domain) {
94  return !Strings.isBlank(domain) && allowedDomains.stream().anyMatch(wd -> domain.endsWith(wd));
95  }
96 
97  @Override
98  public void whiteList(ActivityAction action) {
99  CrawlerLists cl = loadLists();
100  cl.getWhiteList().add(action);
101  saveLists(cl);
102  setAllowedFrom(cl);
103  }
104 
105  @Override
106  public void blackList(ActivityAction action) {
107  CrawlerLists cl = loadLists();
108  cl.getBlackList().add(action);
109  saveLists(cl);
110  setAllowedFrom(cl);
111  }
112 
113  @Override
114  public Consumer<ActivityAction> onWhiteList() {
115  return (action) -> {
116 // MailSenders.getPool()
117 // .addAdministrators()
118 // .send("Whitelisted",
119 // Phrases.start("Whitelisted", action.getIp(), ":", action.getDomain())
120 // .line().add(action.getReason()).toString());
121  };
122  }
123 
124  @Override
125  public Consumer<ActivityAction> onBlackList() {
126  return (action) -> {
127 // MailSenders.getPool()
128 // .addAdministrators()
129 // .send("Blacklisted",
130 // Phrases.start("Blacklisted", action.getIp(), ":", action.getDomain())
131 // .line().add(action.getReason()).toString());
132  };
133  }
134 
135  /* Factory */
136 
137  public final static transient String CRAWLERS_RESPONSE = "/WEB-INF/elephant/activity/crawlers.html";
138 
139  public static CrawlerOptions instance() {
140  return load();
141  }
142 
143  /* Storage */
144 
145  private final static transient String
146  CRAWLERS_FILE = "/WEB-INF/elephant/activity/crawlers.json",
147  CRAWLERS_LIST = "/WEB-INF/elephant/activity/crawlers-lists.json";
148 
149  private CrawlerLists loadLists() {
150  Document crawlers = Document.from(ElephantContext.getRealPath(CRAWLERS_LIST));
151  if(crawlers.exists()) {
152  try {
153  CrawlerLists cl = IJSONizable.fromJson(
154  crawlers.content(), CrawlerLists.class);
155  if(whiteList != null && blackList != null && (!whiteList.isEmpty() || !blackList.isEmpty())) {
156  whiteList.forEach(ip -> cl.getWhiteList().add(new ActivityAction(ip, ip, "")));
157  blackList.forEach(ip -> cl.getBlackList().add(new ActivityAction(ip, ip, "")));
158  whiteList = null;
159  blackList = null;
160  saveDefinitions();
161  saveLists(cl);
162  }
163  return cl;
164  } catch (Exception ex) {
165  crawlers.delete();
166  Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE, null, ex);
167  }
168  }
169  return new CrawlerLists();
170  }
171 
172  private void setAllowedFrom(CrawlerLists cl) {
173  allowedSet = cl.getAllowedSet();
174  }
175 
176  private void saveDefinitions() {
177  save(this, CRAWLERS_FILE);
178  }
179 
180  private void saveLists(CrawlerLists cl) {
181  save(cl, CRAWLERS_LIST);
182  }
183 
184  private static final Executable LOCK = new Executable();
185 
186  private void save(IJSONizable jSONizable, String file) {
187  LOCK.run(() -> {
188  Document crawlers = Document.from(ElephantContext.getRealPath(file));
189  try {
190  crawlers.content(jSONizable.toJson());
191  } catch (IOException ex) {
192  Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE, null, ex);
193  }
194  });
195  }
196 
197  private static CrawlerOptions load() {
198  Document crawlers = Document.from(ElephantContext.getRealPath(CRAWLERS_FILE));
199  if(crawlers.exists()) {
200  try {
201  CrawlerOptions co = IJSONizable.fromJson(
202  crawlers.content(), CrawlerOptions.class);
203  addDefaults(co);
204  co.setAllowedFrom(co.loadLists());
205  return co;
206  } catch (Exception ex) {
207  crawlers.delete();
208  Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE, null, ex);
209  }
210  }
211  return createDefaults();
212  }
213 
214  private static CrawlerOptions createDefaults() {
215  CrawlerOptions co = new CrawlerOptions();
216  co.active = true;
217  co.maxMinutes = 15;
218  co.maxRequests = 15 * 60 * 2;
219  addDefaults(co);
220  return co;
221  }
222 
223  private static CrawlerOptions addDefaults(CrawlerOptions co) {
224  co.allowedDomains.add(".crawl.baidu.com");
225  co.allowedDomains.add(".crawl.baidu.jp");
226  co.allowedDomains.add(".search.msn.com");
227  co.allowedDomains.add(".google.com");
228  co.allowedDomains.add(".googlebot.com");
229  co.allowedDomains.add(".crawl.yahoo.net");
230  co.allowedDomains.add(".yandex.com");
231  co.allowedDomains.add(".yandex.net");
232  co.allowedDomains.add(".yandex.ru");
233  co.badRequest.add(".*index.(php|asp).*");
234  co.badRequest.add(".*phpmyadmin.*");
235  co.badRequest.add(".*phpstorm.*");
236  co.badRequest.add(".*phpunit.*");
237  co.badRequest.add(".*crypto/\\.env.*");
238  co.badRequest.add(".*invokefunction.*");
239  co.badRequest.add(".*aws/credentials.*");
240  co.badRequest.add(".*git/credentials.*");
241  co.badRequest.add(".*Dockerrun.aws.json.*");
242  co.badRequest.add(".*/cgi-bin/.*");
243  co.badRequest.add(".*\\.well-known/security.txt.*");
244  co.badAgent.add(".*MtmKilledYou.*");
245  return co;
246  }
247 
248  /* IJSONizable */
249 
250  @Override
251  public String toJson() {
252  return toJson(this);
253  }
254 
255  @Override
256  public String toJson(Map<String, Object> properties) {
257  return toJson(this, properties);
258  }
259 
260 }
String toJson(Map<String, Object> properties)