GSOC Spider code
Signed-off-by: Vipul Gupta (@vipulgupta2048) <[email protected]>
vipulgupta2048 committed Jun 11, 2019
1 parent 3485342 commit 7814e0e
Showing 33 changed files with 1,087 additions and 0 deletions.
105 changes: 105 additions & 0 deletions placement/.gitignore
@@ -0,0 +1,105 @@
# Byte-compiled / optimized / DLL files
__pycache__/
/*/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
3 changes: 3 additions & 0 deletions placement/.vscode/settings.json
@@ -0,0 +1,3 @@
{
    "python.pythonPath": "spider/bin/python3.6"
}
Empty file added placement/__init__.py
13 changes: 13 additions & 0 deletions placement/items.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class Product(scrapy.Item):
    link = scrapy.Field()
    name = scrapy.Field()
    year = scrapy.Field()
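
items.py only declares the item's fields; the spiders (added elsewhere in this commit but not shown above) fill them in. As a rough sketch of how Product would typically be populated (the spider name, start URL, and CSS selectors below are placeholders, not the actual GSOC code):

import scrapy

from placement.items import Product


class PlacementSpider(scrapy.Spider):
    name = 'placement'
    # Placeholder URL; the real spider targets Amity placement pages.
    start_urls = ['https://example.com/placements']

    def parse(self, response):
        # Hypothetical selectors, for illustration only.
        for row in response.css('div.notice'):
            item = Product()
            item['link'] = row.css('a::attr(href)').get()
            item['name'] = row.css('a::text').get()
            item['year'] = 2019
            yield item
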
103 changes: 103 additions & 0 deletions placement/middlewares.py
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class PlacementSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PlacementDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
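
Both classes above are Scrapy's unmodified middleware templates, and they stay inert until registered in settings.py. A minimal sketch of that wiring, assuming the project module is placement (543 is the conventional priority from Scrapy's generated settings template, not a value confirmed by this diff):

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'placement.middlewares.PlacementSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'placement.middlewares.PlacementDownloaderMiddleware': 543,
}
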
38 changes: 38 additions & 0 deletions placement/monitors.py
@@ -0,0 +1,38 @@
from spidermon import Monitor, MonitorSuite, monitors
from spidermon.contrib.monitors.mixins import StatsMonitorMixin

# @monitors.name('Item count')
# class ItemCountMonitor(Monitor):

#     @monitors.name('Minimum number of items')
#     def test_minimum_number_of_items(self):
#         item_extracted = getattr(
#             self.data.stats, 'item_scraped_count', 0)
#         minimum_threshold = 10

#         msg = 'Extracted less than {} items'.format(
#             minimum_threshold)
#         self.assertTrue(
#             item_extracted >= minimum_threshold, msg=msg
#         )


@monitors.name('Item validation')
class ItemValidationMonitor(Monitor, StatsMonitorMixin):
# class ItemValidationMonitor(Monitor):

    @monitors.name('No item validation errors')
    def test_no_item_validation_errors(self):
        validation_errors = getattr(
            self.stats, 'spidermon/validation/fields/errors', 0
        )
        self.assertEqual(
            validation_errors,
            0,
            msg='Found validation errors in {} fields'.format(
                validation_errors)
        )


class SpiderCloseMonitorSuite(MonitorSuite):
    monitors = [
        # ItemCountMonitor,
        ItemValidationMonitor,
    ]
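
Spidermon only runs this suite if it is enabled and registered in settings.py. A minimal sketch following the standard Spidermon setup (this commit's actual settings.py is not shown in the excerpt above):

# settings.py (sketch)
SPIDERMON_ENABLED = True

EXTENSIONS = {
    'spidermon.contrib.scrapy.extensions.Spidermon': 500,
}

# Run SpiderCloseMonitorSuite when each spider finishes.
SPIDERMON_SPIDER_CLOSE_MONITORS = (
    'placement.monitors.SpiderCloseMonitorSuite',
)
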
11 changes: 11 additions & 0 deletions placement/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class PlacementPipeline(object):
    def process_item(self, item, spider):
        return item
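
PlacementPipeline is a pass-through until it is registered. A sketch of the usual wiring, with Spidermon's ItemValidationPipeline included so the spidermon/validation/... stats checked by ItemValidationMonitor actually get populated (the priority numbers are conventional, not taken from this diff):

# settings.py (sketch)
ITEM_PIPELINES = {
    'placement.pipelines.PlacementPipeline': 300,
    'spidermon.contrib.scrapy.pipelines.ItemValidationPipeline': 800,
}
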
15 changes: 15 additions & 0 deletions placement/schema.json
@@ -0,0 +1,15 @@
{
    "link": {
        "type": "string",
        "required": true
    },
    "name": {
        "type": "string",
        "required": true
    },
    "year": {
        "type": "integer",
        "max": 2021,
        "min": 2018
    }
}
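
schema.json uses Cerberus-style keys (required as a boolean, max/min), so one way to exercise it is with the cerberus library directly; how this particular file is consumed is not shown in the diff. An illustrative check with made-up item values:

import json

from cerberus import Validator

with open('placement/schema.json') as f:
    schema = json.load(f)

v = Validator(schema)
# validate() returns True/False; v.errors holds per-field messages.
print(v.validate({'link': 'https://example.com/notice', 'name': 'Campus drive', 'year': 2019}))
print(v.errors)
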
23 changes: 23 additions & 0 deletions placement/schemax.json
@@ -0,0 +1,23 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "Placement Opportunities",
    "description": "Placement opportunities at Amity University",
    "type": "object",
    "properties": {
        "link": {
            "description": "URL to more information about opportunity",
            "type": "string"
        },
        "name": {
            "description": "Title of the opportunity",
            "type": "string"
        },
        "year": {
            "description": "For students of which year",
            "type": "integer",
            "maximum": 2021,
            "minimum": 2018
        }
    },
    "required": ["link", "name"]
}
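
schemax.json, in contrast, is a standard draft-07 JSON Schema, the format Spidermon's SPIDERMON_VALIDATION_SCHEMAS setting expects; it can also be checked standalone with the jsonschema library. An illustrative check (the item values are made up):

import json

from jsonschema import validate

with open('placement/schemax.json') as f:
    schema = json.load(f)

# Raises jsonschema.exceptions.ValidationError if the item does not conform.
validate(
    instance={'link': 'https://example.com/notice', 'name': 'Campus drive', 'year': 2019},
    schema=schema,
)
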