diff --git a/libs/libcalg.so b/libs/libcalg.so
new file mode 100644
index 0000000..a7664db
Binary files /dev/null and b/libs/libcalg.so differ
diff --git a/libs/libcalg.so.0.0.0 b/libs/libcalg.so.0.0.0
new file mode 100644
index 0000000..a7664db
Binary files /dev/null and b/libs/libcalg.so.0.0.0 differ
diff --git a/requirements.txt b/requirements.txt
index 266024d..f721f1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
-scrapy
-pybloomfilter
-requests
-bs4
-twisted==16.6.0
-w3lib
-lxml
-six
-cssselect
-pyopenssl
-cryptography
-queuelib
+scrapy==1.8.4
+pybloomfilter==1.0
+requests==2.27.1
+beautifulsoup4==4.9.3
+twisted==20.3.0
+w3lib==1.22.0
+lxml==5.0.1
+six==1.16.0
+cssselect==1.1.0
+pyopenssl==21.0.0
+cryptography==3.3.2
+queuelib==1.5.0
diff --git a/xsscrapy/bloomfilter.py b/xsscrapy/bloomfilter.py
new file mode 100644
index 0000000..48b2ba5
--- /dev/null
+++ b/xsscrapy/bloomfilter.py
@@ -0,0 +1,115 @@
+# pybloomfilter.py
+#
+# Copyright 2009 ahmed youssef
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301, USA.
+
+
+__all__ = ["BloomFilter"]
+
+from ctypes import *
+import os
+
+lib_path = os.path.abspath("libs/libcalg.so")
+try:
+    calg = CDLL(lib_path)  # CDLL raises OSError on failure rather than returning a false value
+except OSError:
+    exit("Error loading libcalg.so")
+
+# bloomfilter c-definitions
+
+bloomfilter_value = c_void_p
+HASH_FUNC = CFUNCTYPE(c_ulong, bloomfilter_value)
+
+# Wrap libcalg's hash functions in the HASH_FUNC prototype so they can be
+# passed wherever a HASH_FUNC argument is declared
+string_hash = HASH_FUNC(("string_hash", calg))
+int_hash = HASH_FUNC(("int_hash", calg))
+
+class BloomFilterStruct(Structure):
+    _fields_ = [
+        ("hash_func", HASH_FUNC),
+        ("table", POINTER(c_ubyte)),
+        ("table_size", c_uint),
+        ("num_functions", c_uint),
+    ]
+
+bloomfilter_p = POINTER(BloomFilterStruct)
+
+bf_new = calg.bloom_filter_new
+bf_new.restype = bloomfilter_p
+bf_new.argtypes = [c_uint, HASH_FUNC, c_uint]
+
+bf_free = calg.bloom_filter_free
+bf_free.restype = None
+bf_free.argtypes = [bloomfilter_p]
+
+bf_insert = calg.bloom_filter_insert
+bf_insert.restype = None
+bf_insert.argtypes = [bloomfilter_p, bloomfilter_value]
+
+bf_query = calg.bloom_filter_query
+bf_query.restype = c_int
+bf_query.argtypes = [bloomfilter_p, bloomfilter_value]
+
+# python wrapper
+
+class BloomFilter(object):
+
+    def __init__(self, table_size=128, hash_func=string_hash, num_functions=1):
+        """
+        A bloom filter is a space-efficient data structure that can be used
+        to test whether a given element is part of a set. Lookups will
+        occasionally generate false positives, but never false negatives.
+        """
+        self._bloomfilter = bf_new(table_size, hash_func, num_functions)
+
+    def insert(self, val):
+        """
+        Insert a value into the bloom filter.
+        """
+        bf_insert(self._bloomfilter, str(val))
+
+    def query(self, val):
+        """
+        Query the bloom filter for a particular value.
+        """
+        return bf_query(self._bloomfilter, str(val))
+
+    def __contains__(self, val):
+        """
+        Check if a value is in the bloom filter.
+        """
+        return self.query(val)
+
+    def __del__(self):
+        """
+        Free the resources allocated by the bloom filter.
+        """
+        if self._bloomfilter:
+            bf_free(self._bloomfilter)
+
+
+if __name__ == "__main__":
+
+    b = BloomFilter()
+    b.insert("ahmed")
+    b.insert("ayman")
+    print "ahmed" in b
+    print "ayman" in b
+    print "memo" in b
+
+    del b
diff --git a/xsscrapy/bloomfilters.py b/xsscrapy/bloomfilters.py
index a0d9efb..c31d5b4 100644
--- a/xsscrapy/bloomfilters.py
+++ b/xsscrapy/bloomfilters.py
@@ -1,4 +1,4 @@
-from pybloomfilter import BloomFilter
+from xsscrapy.bloomfilter import BloomFilter
 from scrapy.utils.job import job_dir
 from scrapy.dupefilters import BaseDupeFilter
 from xsscrapy.settings import bloomfilterSize
@@ -8,7 +8,7 @@ class BloomURLDupeFilter(BaseDupeFilter):
     def __init__(self, path=None):
         self.file = None
-        self.fingerprints = BloomFilter(bloomfilterSize*10, 0.0001)
+        self.fingerprints = BloomFilter(bloomfilterSize)
 
     @classmethod
     def from_settings(cls, settings):
@@ -18,7 +18,7 @@ def request_seen(self, request):
         fp = request.url
         if fp in self.fingerprints:
             return True
-        self.fingerprints.add(fp)
+        self.fingerprints.insert(fp)
 
     def close(self, reason):
         self.fingerprints = None
diff --git a/xsscrapy/middlewares.py b/xsscrapy/middlewares.py
index 3693811..903ca8b 100644
--- a/xsscrapy/middlewares.py
+++ b/xsscrapy/middlewares.py
@@ -1,26 +1,24 @@
 from scrapy.exceptions import IgnoreRequest
-from urllib.parse import unquote
-from pybloomfilter import BloomFilter
+from urlparse import unquote
+from xsscrapy.bloomfilter import BloomFilter
 import random
 import re
 from xsscrapy.settings import bloomfilterSize
 
-# Filter out duplicate requests with Bloom filters since they're much easier on memory
-#URLS_FORMS_HEADERS = BloomFilter(3000000, 0.00001)
-URLS_SEEN = BloomFilter(bloomfilterSize, .0001)
-FORMS_SEEN = BloomFilter(bloomfilterSize, .0001)
-HEADERS_SEEN = BloomFilter(bloomfilterSize, .0001)
-USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
-                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
-                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
-                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
-                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
-                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
-
 class RandomUserAgentMiddleware(object):
     ''' Use a random user-agent for each request '''
+
+    USER_AGENT_LIST = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0'
+    ]
+
     def process_request(self, request, spider):
-        ua = random.choice(USER_AGENT_LIST)
+        ua = random.choice(RandomUserAgentMiddleware.USER_AGENT_LIST)
         if 'payload' in request.meta:
             payload = request.meta['payload']
             if 'User-Agent' in request.headers:
@@ -33,6 +31,10 @@ def process_request(self, request, spider):
 
 class InjectedDupeFilter(object):
     ''' Filter duplicate payloaded URLs, headers, and forms since all of those have dont_filter = True '''
+    URLS_SEEN = BloomFilter(bloomfilterSize)
+    FORMS_SEEN = BloomFilter(bloomfilterSize)
+    HEADERS_SEEN = BloomFilter(bloomfilterSize)
+
     def process_request(self, request, spider):
         meta = request.meta
@@ -46,10 +48,10 @@ def process_request(self, request, spider):
             #replace the delim characters with nothing so we only test the URL
             #with the payload
             no_delim_url = url.replace(delim, '')
-            if no_delim_url in URLS_SEEN:
+            if no_delim_url in InjectedDupeFilter.URLS_SEEN:
                 raise IgnoreRequest
             spider.log('Sending payloaded URL: %s' % url)
-            URLS_SEEN.add(url)
+            InjectedDupeFilter.URLS_SEEN.insert(no_delim_url)
             return
 
         # Injected form dupe handling
@@ -57,10 +59,10 @@ def process_request(self, request, spider):
             u = meta['POST_to']
             p = meta['xss_param']
             u_p = (u, p)
-            if u_p in FORMS_SEEN:
+            if u_p in InjectedDupeFilter.FORMS_SEEN:
                 raise IgnoreRequest
             spider.log('Sending payloaded form param %s to: %s' % (p, u))
-            FORMS_SEEN.add(u_p)
+            InjectedDupeFilter.FORMS_SEEN.insert(u_p)
             return
 
         # Injected header dupe handling
@@ -69,8 +71,8 @@ def process_request(self, request, spider):
             h = meta['xss_param']
             # URL, changed header, payload
             u_h = (u, h)
-            if u_h in HEADERS_SEEN:
+            if u_h in InjectedDupeFilter.HEADERS_SEEN:
                 raise IgnoreRequest
             spider.log('Sending payloaded %s header' % h)
-            HEADERS_SEEN.add(u_h)
+            InjectedDupeFilter.HEADERS_SEEN.insert(u_h)
             return
diff --git a/xsscrapy/pipelines.py b/xsscrapy/pipelines.py
index bead038..0dbf451 100644
--- a/xsscrapy/pipelines.py
+++ b/xsscrapy/pipelines.py
@@ -3,8 +3,8 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from scrapy.exceptions import DropItem
-from html.parser import HTMLParser
 from xsscrapy.items import vuln#, inj_resp
+import HTMLParser
 import re
 import lxml.etree
 import lxml.html
@@ -12,7 +12,7 @@
 import itertools
 #from IPython import embed
 from socket import gaierror, gethostbyname
-from urllib.parse import urlparse
+from urlparse import urlparse
 from logging import CRITICAL, ERROR, WARNING, INFO, DEBUG
 
 class XSSCharFinder(object):
@@ -26,7 +26,7 @@ def get_filename(self, url):
         filename = up + '.txt'
         return filename
-    
+
     def open_spider(self, spider):
         self.filename = self.get_filename(spider.url)
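
A minimal usage sketch of the new ctypes-backed wrapper in xsscrapy/bloomfilter.py, not part of the diff, showing the behavior the dupe filters above rely on. It assumes the process runs from the repo root so os.path.abspath("libs/libcalg.so") resolves, and the table_size value is illustrative; Python 2 syntax to match the codebase.

    from xsscrapy.bloomfilter import BloomFilter

    # table_size and num_functions map onto bloom_filter_new(); unlike the
    # old pybloomfilter API there is no error-rate argument, so accuracy is
    # governed only by table size and the number of hash passes
    bf = BloomFilter(table_size=1024)
    bf.insert('http://example.com/?q=1')

    print 'http://example.com/?q=1' in bf    # True: bloom filters never give false negatives
    print 'http://example.com/?q=2' in bf    # usually False: false positives are possible

Because insert() and query() both coerce their argument with str(), tuples such as the (url, param) and (url, header) keys used by InjectedDupeFilter are hashed by their string representation, which keeps inserts and membership tests consistent.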