19 package org.turro.elephant.servlet.crawler;
21 import java.io.IOException;
22 import java.util.HashSet;
25 import java.util.function.Consumer;
26 import java.util.logging.Level;
27 import java.util.logging.Logger;
28 import org.turro.elephant.context.ElephantContext;
29 import org.turro.file.Document;
30 import org.turro.http.activity.ActivityAction;
31 import org.turro.http.activity.IActivityOptions;
32 import org.turro.json.IJSONizable;
33 import org.turro.lock.Executable;
34 import org.turro.string.Strings;
// NOTE(review): no use of `active` or `maxMinutes` is visible in this excerpt;
// presumably an on/off switch and a time window in minutes — confirm.
42 private boolean active;
44 private int maxMinutes;
// createDefaults() assigns 15 * 60 * 2 to this field.
45 private int maxRequests;
/** Regular expressions; a request is bad when it is null or String.matches() succeeds for any entry. */
47 private final Set<String> badRequest =
new HashSet<>();
/** Regular expressions; an agent is bad when it is null or String.matches() succeeds for any entry. */
48 private final Set<String> badAgent =
new HashSet<>();
/** Domain suffixes; a non-blank domain is allowed when it ends with any entry. */
49 private final Set<String> allowedDomains =
new HashSet<>();
// Either set may be null. When both are non-null and at least one is non-empty,
// every entry is copied into the CrawlerLists white/black list as
// new ActivityAction(entry, entry, "").
// NOTE(review): these look like legacy IP lists superseded by CrawlerLists — confirm.
54 private Set<String> whiteList;
55 private Set<String> blackList;
84 return request ==
null || badRequest.stream().anyMatch(br -> request.matches(br));
89 return agent ==
null || badAgent.stream().anyMatch(br -> agent.matches(br));
94 return !Strings.isBlank(domain) && allowedDomains.stream().anyMatch(wd -> domain.endsWith(wd));
/**
 * Path, below WEB-INF, of the crawlers HTML file.
 * NOTE(review): no use is visible in this excerpt; presumably the page
 * returned to rejected crawlers — confirm against the callers.
 */
137 public final static transient String
CRAWLERS_RESPONSE =
"/WEB-INF/elephant/activity/crawlers.html";
/**
 * Locations of the persisted JSON files, resolved through
 * ElephantContext.getRealPath() before use. CRAWLERS_FILE holds this options
 * object (written by saveDefinitions(), read by load()); CRAWLERS_LIST holds
 * the CrawlerLists (written by saveLists()).
 */
145 private final static transient String
146 CRAWLERS_FILE =
"/WEB-INF/elephant/activity/crawlers.json",
147 CRAWLERS_LIST =
"/WEB-INF/elephant/activity/crawlers-lists.json";
151 if(crawlers.exists()) {
155 if(whiteList !=
null && blackList !=
null && (!whiteList.isEmpty() || !blackList.isEmpty())) {
156 whiteList.forEach(ip -> cl.
getWhiteList().add(
new ActivityAction(ip, ip,
"")));
157 blackList.forEach(ip -> cl.
getBlackList().add(
new ActivityAction(ip, ip,
"")));
164 }
catch (Exception ex) {
166 Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE,
null, ex);
169 return new CrawlerLists();
172 private void setAllowedFrom(CrawlerLists cl) {
173 allowedSet = cl.getAllowedSet();
176 private void saveDefinitions() {
177 save(
this, CRAWLERS_FILE);
180 private void saveLists(CrawlerLists cl) {
181 save(cl, CRAWLERS_LIST);
// Single Executable shared by all instances.
// NOTE(review): presumably guards the file writes in save(); the call site is
// not visible in this excerpt — confirm.
184 private static final Executable LOCK =
new Executable();
186 private void save(IJSONizable jSONizable, String file) {
188 Document crawlers = Document.from(ElephantContext.getRealPath(file));
190 crawlers.content(jSONizable.toJson());
191 } catch (IOException ex) {
192 Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE,
null, ex);
197 private static CrawlerOptions load() {
198 Document crawlers = Document.from(ElephantContext.getRealPath(CRAWLERS_FILE));
199 if(crawlers.exists()) {
201 CrawlerOptions co = IJSONizable.fromJson(
202 crawlers.content(), CrawlerOptions.class);
204 co.setAllowedFrom(co.loadLists());
206 }
catch (Exception ex) {
208 Logger.getLogger(CrawlerOptions.class.getName()).log(Level.SEVERE,
null, ex);
211 return createDefaults();
214 private static CrawlerOptions createDefaults() {
215 CrawlerOptions co =
new CrawlerOptions();
218 co.maxRequests = 15 * 60 * 2;
223 private static CrawlerOptions addDefaults(CrawlerOptions co) {
224 co.allowedDomains.add(
".crawl.baidu.com");
225 co.allowedDomains.add(
".crawl.baidu.jp");
226 co.allowedDomains.add(
".search.msn.com");
227 co.allowedDomains.add(
".google.com");
228 co.allowedDomains.add(
".googlebot.com");
229 co.allowedDomains.add(
".crawl.yahoo.net");
230 co.allowedDomains.add(
".yandex.com");
231 co.allowedDomains.add(
".yandex.net");
232 co.allowedDomains.add(
".yandex.ru");
233 co.badRequest.add(
".*index.(php|asp).*");
234 co.badRequest.add(
".*phpmyadmin.*");
235 co.badRequest.add(
".*phpstorm.*");
236 co.badRequest.add(
".*phpunit.*");
237 co.badRequest.add(
".*crypto/\\.env.*");
238 co.badRequest.add(
".*invokefunction.*");
239 co.badRequest.add(
".*aws/credentials.*");
240 co.badRequest.add(
".*git/credentials.*");
241 co.badRequest.add(
".*Dockerrun.aws.json.*");
242 co.badRequest.add(
".*/cgi-bin/.*");
243 co.badRequest.add(
".*\\.well-known/security.txt.*");
244 co.badAgent.add(
".*MtmKilledYou.*");
256 public String
toJson(Map<String, Object> properties) {
257 return toJson(
this, properties);
static String getRealPath(String path)
boolean isWhiteListed(String ip)
boolean isBlackListed(String ip)
Set< ActivityAction > getBlackList()
Set< ActivityAction > getWhiteList()
String toJson(Map< String, Object > properties)
boolean isBadAgent(String agent)
static final transient String CRAWLERS_RESPONSE
Consumer< ActivityAction > onWhiteList()
void whiteList(ActivityAction action)
boolean isAllowedDomain(String domain)
boolean isBlackListed(String ip)
void blackList(ActivityAction action)
boolean isBadRequest(String request)
Consumer< ActivityAction > onBlackList()
boolean isWhiteListed(String ip)
static CrawlerOptions instance()