diff --git a/README.md b/README.md index 67bdc08f..c408e163 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SpiderKeeper +# SpiderKeeper fork [![Latest Version](http://img.shields.io/pypi/v/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) [![Python Versions](http://img.shields.io/pypi/pyversions/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) @@ -12,6 +12,7 @@ A scalable admin ui for spider service - With a single click deploy the scrapy project - Show spider running stats - Provide api +- Show scraped items and download json/csv Current Support spider service diff --git a/SpiderKeeper/app/proxy/contrib/scrapy.py b/SpiderKeeper/app/proxy/contrib/scrapy.py index 9acad39e..f8fb60e7 100644 --- a/SpiderKeeper/app/proxy/contrib/scrapy.py +++ b/SpiderKeeper/app/proxy/contrib/scrapy.py @@ -86,3 +86,6 @@ def deploy(self, project_name, file_path): def log_url(self, project_name, spider_name, job_id): return self._scrapyd_url() + '/logs/%s/%s/%s.log' % (project_name, spider_name, job_id) + + def items_url(self, project_name, spider_name, job_id): + return self._scrapyd_url() + '/items/%s/%s/%s.jl' % (project_name, spider_name, job_id) diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index 2e47cfab..fa540265 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -182,6 +182,14 @@ def log_url(self, job_execution): return spider_service_instance.log_url(project.project_name, job_instance.spider_name, job_execution.service_job_execution_id) + def items_url(self, job_execution): + job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) + project = Project.find_project_by_id(job_instance.project_id) + for spider_service_instance in self.spider_service_instances: + if spider_service_instance.server == job_execution.running_on: + return spider_service_instance.items_url(project.project_name, job_instance.spider_name, + job_execution.service_job_execution_id) + @property def servers(self): return [self.spider_service_instance.server for self.spider_service_instance in diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 296126b3..29e5eeb1 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -1,6 +1,8 @@ import datetime import os import tempfile +import json +import csv import flask_restful import requests @@ -10,6 +12,7 @@ from flask import redirect from flask import render_template from flask import session +from flask import send_from_directory from flask_restful_swagger import swagger from werkzeug.utils import secure_filename @@ -595,6 +598,48 @@ def job_log(project_id, job_exec_id): return render_template("job_log.html", log_lines=raw.split('\n')) +@app.route("/project//jobexecs//items") +def job_items(project_id, job_exec_id): + job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first() + res = requests.get(agent.items_url(job_execution)) + res.encoding = 'utf8' + json_data = [ json.loads(s) for s in filter(None, res.text.split('\n'))] + return render_template("job_items.html", items=json_data) + + +@app.route("/project//jobexecs//items/download") +def download_items(project_id, job_exec_id): + format = request.args.get('format') + if not format in ['json', 'csv']: + abort(404) + + job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first() + + job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) + project = Project.find_project_by_id(job_instance.project_id) + + res = requests.get(agent.items_url(job_execution)) + res.encoding = 'utf8' + json_data = [json.loads(s) for s in filter(None, res.text.split('\n'))] + + filename = '{}-{}.{}'.format(project.project_name, job_instance.spider_name, format) + if format == 'json': + open(os.path.join(app.static_folder, filename), 'w').write(json.dumps(json_data)) + elif format == 'csv': + f = open(os.path.join(app.static_folder, filename), 'w') + csvwriter = csv.writer(f) + count = 0 + for item in json_data: + if count == 0: + header = item.keys() + csvwriter.writerow(header) + count += 1 + csvwriter.writerow(item.values()) + f.close() + + return send_from_directory(app.static_folder, filename, as_attachment=True) + + @app.route("/project//job//run") def job_run(project_id, job_instance_id): job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first() diff --git a/SpiderKeeper/app/static/css/app.css b/SpiderKeeper/app/static/css/app.css index 67a42216..4d395432 100644 --- a/SpiderKeeper/app/static/css/app.css +++ b/SpiderKeeper/app/static/css/app.css @@ -11,4 +11,16 @@ overflow: hidden; text-overflow: ellipsis; width: 100px; +} + +#table-count { + counter-reset: rowNumber; +} + +#table-count #item { + counter-increment: rowNumber; +} + +#table-count #item td:first-child::before { + content: counter(rowNumber); } \ No newline at end of file diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html index 494b93d7..445ea9c1 100644 --- a/SpiderKeeper/app/templates/job_dashboard.html +++ b/SpiderKeeper/app/templates/job_dashboard.html @@ -88,6 +88,7 @@

Running Jobs

Runtime Started Log + Items Running On Action @@ -119,9 +120,12 @@

Running Jobs

{% endif %} {{ timedelta(now,job.start_time) }} {{ job.start_time }} - Log + Items + {{ job.running_on }} Completed Jobs Runtime Started Log + Items Status {% for job in job_status.COMPLETED %} {% if job.job_instance %} {{ job.job_execution_id }} - {{ job.job_instance_id }} + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} @@ -184,9 +189,12 @@

Completed Jobs

{% endif %} {{ timedelta(job.end_time,job.start_time) }} {{ job.start_time }} - Log + Items + {% if job.running_status == 2 %} FINISHED diff --git a/SpiderKeeper/app/templates/job_items.html b/SpiderKeeper/app/templates/job_items.html new file mode 100644 index 00000000..3b613fb0 --- /dev/null +++ b/SpiderKeeper/app/templates/job_items.html @@ -0,0 +1,56 @@ +{% extends "base.html" %} +{% block content_header %} +

Job Items

+
    + + + +
+{% endblock %} +{% block content_body %} + +
+
+

Job Items

+
+ +
+
+
+ + {% if items|length > 0 %} + + + {% for key, value in items[0].items() %} + + {% endfor %} + + {% endif %} + + {% for item in items %} + + + {% for key, value in item.items() %} + + {% endfor %} + + {% endfor %} +
#{{ key }}
{{ value }}
+
+
+ +{% endblock %} \ No newline at end of file diff --git a/SpiderKeeper/app/templates/job_log.html b/SpiderKeeper/app/templates/job_log.html index a130775b..9401b19d 100644 --- a/SpiderKeeper/app/templates/job_log.html +++ b/SpiderKeeper/app/templates/job_log.html @@ -1,16 +1,38 @@ - - - - -{% for line in log_lines %} -

{{ line }}

-{% endfor %} - - \ No newline at end of file +{% extends "base.html" %} +{% block content_header %} +

Job Logs

+
    + +
+{% endblock %} +{% block content_body %} + +
+
+

Job Logs

+
+ +
+
+
+ + {% for line in log_lines %} + + + + {% endfor %} +

{{ line }}

+
+
+ +{% endblock %} \ No newline at end of file diff --git a/screenshot/screenshot_3.png b/screenshot/screenshot_3.png old mode 100644 new mode 100755