GSOC Spider code
Signed-off-by: Vipul Gupta (@vipulgupta2048) <[email protected]>
vipulgupta2048 committed Jun 11, 2019
1 parent 3485342 commit 7814e0e
Showing 33 changed files with 1,087 additions and 0 deletions.
105 changes: 105 additions & 0 deletions placement/.gitignore
@@ -0,0 +1,105 @@
# Byte-compiled / optimized / DLL files
__pycache__/
/*/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
3 changes: 3 additions & 0 deletions placement/.vscode/settings.json
@@ -0,0 +1,3 @@
{
    "python.pythonPath": "spider/bin/python3.6"
}
Empty file added placement/__init__.py
13 changes: 13 additions & 0 deletions placement/items.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class Product(scrapy.Item):
    link = scrapy.Field()
    name = scrapy.Field()
    year = scrapy.Field()
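
items.py only declares the item's fields; the spiders (added elsewhere in this commit but not shown above) fill them in. As a rough sketch of how Product would typically be populated (the spider name, start URL, and CSS selectors below are placeholders, not the actual GSOC code):

import scrapy

from placement.items import Product


class PlacementSpider(scrapy.Spider):
    name = 'placement'
    # Placeholder URL; the real spider targets Amity placement pages.
    start_urls = ['https://example.com/placements']

    def parse(self, response):
        # Hypothetical selectors, for illustration only.
        for row in response.css('div.notice'):
            item = Product()
            item['link'] = row.css('a::attr(href)').get()
            item['name'] = row.css('a::text').get()
            item['year'] = 2019
            yield item
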
103 changes: 103 additions & 0 deletions placement/middlewares.py
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class PlacementSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PlacementDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
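
Both classes above are Scrapy's unmodified middleware templates, and they stay inert until registered in settings.py. A minimal sketch of that wiring, assuming the project module is placement (543 is the conventional priority from Scrapy's generated settings template, not a value confirmed by this diff):

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'placement.middlewares.PlacementSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'placement.middlewares.PlacementDownloaderMiddleware': 543,
}
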
38 changes: 38 additions & 0 deletions placement/monitors.py
@@ -0,0 +1,38 @@
from spidermon import Monitor, MonitorSuite, monitors
from spidermon.contrib.monitors.mixins import StatsMonitorMixin

# @monitors.name('Item count')
# class ItemCountMonitor(Monitor):

#     @monitors.name('Minimum number of items')
#     def test_minimum_number_of_items(self):
#         item_extracted = getattr(
#             self.data.stats, 'item_scraped_count', 0)
#         minimum_threshold = 10

#         msg = 'Extracted less than {} items'.format(
#             minimum_threshold)
#         self.assertTrue(
#             item_extracted >= minimum_threshold, msg=msg
#         )


@monitors.name('Item validation')
class ItemValidationMonitor(Monitor, StatsMonitorMixin):
# class ItemValidationMonitor(Monitor):

    @monitors.name('No item validation errors')
    def test_no_item_validation_errors(self):
        validation_errors = getattr(
            self.stats, 'spidermon/validation/fields/errors', 0
        )
        self.assertEqual(
            validation_errors,
            0,
            msg='Found validation errors in {} fields'.format(
                validation_errors)
        )


class SpiderCloseMonitorSuite(MonitorSuite):
    monitors = [
        # ItemCountMonitor,
        ItemValidationMonitor,
    ]
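
Spidermon only runs this suite if it is enabled and registered in settings.py. A minimal sketch following the standard Spidermon setup (this commit's actual settings.py is not shown in the excerpt above):

# settings.py (sketch)
SPIDERMON_ENABLED = True

EXTENSIONS = {
    'spidermon.contrib.scrapy.extensions.Spidermon': 500,
}

# Run SpiderCloseMonitorSuite when each spider finishes.
SPIDERMON_SPIDER_CLOSE_MONITORS = (
    'placement.monitors.SpiderCloseMonitorSuite',
)
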
11 changes: 11 additions & 0 deletions placement/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class PlacementPipeline(object):
    def process_item(self, item, spider):
        return item
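
PlacementPipeline is a pass-through until it is registered. A sketch of the usual wiring, with Spidermon's ItemValidationPipeline included so the spidermon/validation/... stats checked by ItemValidationMonitor actually get populated (the priority numbers are conventional, not taken from this diff):

# settings.py (sketch)
ITEM_PIPELINES = {
    'placement.pipelines.PlacementPipeline': 300,
    'spidermon.contrib.scrapy.pipelines.ItemValidationPipeline': 800,
}
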
15 changes: 15 additions & 0 deletions placement/schema.json
@@ -0,0 +1,15 @@
{
    "link": {
        "type": "string",
        "required": true
    },
    "name": {
        "type": "string",
        "required": true
    },
    "year": {
        "type": "integer",
        "max": 2021,
        "min": 2018
    }
}
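
schema.json uses Cerberus-style keys (required as a boolean, max/min), so one way to exercise it is with the cerberus library directly; how this particular file is consumed is not shown in the diff. An illustrative check with made-up item values:

import json

from cerberus import Validator

with open('placement/schema.json') as f:
    schema = json.load(f)

v = Validator(schema)
# validate() returns True/False; v.errors holds per-field messages.
print(v.validate({'link': 'https://example.com/notice', 'name': 'Campus drive', 'year': 2019}))
print(v.errors)
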
23 changes: 23 additions & 0 deletions placement/schemax.json
@@ -0,0 +1,23 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "Placement Opportunities",
    "description": "Placement opportunities at Amity University",
    "type": "object",
    "properties": {
        "link": {
            "description": "URL to more information about opportunity",
            "type": "string"
        },
        "name": {
            "description": "Title of the opportunity",
            "type": "string"
        },
        "year": {
            "description": "For students of which year",
            "type": "integer",
            "maximum": 2021,
            "minimum": 2018
        }
    },
    "required": ["link", "name"]
}
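
schemax.json, in contrast, is a standard draft-07 JSON Schema, the format Spidermon's SPIDERMON_VALIDATION_SCHEMAS setting expects; it can also be checked standalone with the jsonschema library. An illustrative check (the item values are made up):

import json

from jsonschema import validate

with open('placement/schemax.json') as f:
    schema = json.load(f)

# Raises jsonschema.exceptions.ValidationError if the item does not conform.
validate(
    instance={'link': 'https://example.com/notice', 'name': 'Campus drive', 'year': 2019},
    schema=schema,
)
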