Skip to content

Commit

Permalink
container-updater: temporary account update suppression on errors
Browse files Browse the repository at this point in the history
  • Loading branch information
gholt authored and Tarmac committed Jan 26, 2011
2 parents 2467d8b + d41e774 commit edb4e90
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 19 deletions.
32 changes: 19 additions & 13 deletions doc/source/deployment_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -371,19 +371,25 @@ reclaim_age 604800 Time elapsed in seconds before a

[container-updater]

================== ================= =======================================
Option Default Description
------------------ ----------------- ---------------------------------------
log_name container-updater Label used when logging
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level
interval 300 Minimum time for a pass to take
concurrency 4 Number of updater workers to spawn
node_timeout 3 Request timeout to external services
conn_timeout 0.5 Connection timeout to external services
slowdown 0.01 Time in seconds to wait between
containers
================== ================= =======================================
======================== ================= ==================================
Option Default Description
------------------------ ----------------- ----------------------------------
log_name container-updater Label used when logging
log_facility LOG_LOCAL0 Syslog log facility
log_level INFO Logging level
interval 300 Minimum time for a pass to take
concurrency 4 Number of updater workers to spawn
node_timeout 3 Request timeout to external
services
conn_timeout 0.5 Connection timeout to external
services
slowdown 0.01 Time in seconds to wait between
containers
account_suppression_time 60 Seconds to suppress updating an
account that has generated an
error (timeout, not yet found,
etc.)
======================== ================= ==================================

[container-auditor]

Expand Down
2 changes: 2 additions & 0 deletions etc/container-server.conf-sample
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ use = egg:swift#container
# conn_timeout = 0.5
# slowdown will sleep that amount between containers
# slowdown = 0.01
# Seconds to suppress updating an account that has generated an error
# account_suppression_time = 60

[container-auditor]
# You can override the default log routing for this app here (don't use set!):
Expand Down
53 changes: 47 additions & 6 deletions swift/container/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import sys
import time
from random import random, shuffle
from tempfile import mkstemp

from eventlet import spawn, patcher, Timeout

Expand Down Expand Up @@ -51,6 +52,10 @@ def __init__(self, conf):
self.no_changes = 0
self.successes = 0
self.failures = 0
self.account_suppressions = {}
self.account_suppression_time = \
float(conf.get('account_suppression_time', 60))
self.new_account_suppressions = None

def get_account_ring(self):
"""Get the account ring. Load it if it hasn't been yet."""
Expand Down Expand Up @@ -80,6 +85,19 @@ def get_paths(self):
shuffle(paths)
return paths

def _load_suppressions(self, filename):
    """
    Merge the account suppressions recorded in a file into
    self.account_suppressions, then delete the file.

    Each line of the file is expected to hold an account name followed by
    the timestamp until which updates to that account are suppressed,
    separated by whitespace. Any error while reading or parsing is logged
    and stops the load; entries read before the error are kept.

    :param filename: path of the suppression file to read; it is unlinked
                     afterwards whether or not loading succeeded
    """
    try:
        with open(filename, 'r') as tmpfile:
            for line in tmpfile:
                # Split from the right: the timestamp is always the last
                # field, so an account name that itself contains
                # whitespace is still parsed correctly.
                account, until = line.rsplit(None, 1)
                self.account_suppressions[account] = float(until)
    except Exception:
        # Not a bare except: SystemExit and KeyboardInterrupt must still
        # propagate instead of being logged and swallowed here.
        self.logger.exception(
            _('ERROR with loading suppressions from %s: ') % filename)
    finally:
        os.unlink(filename)

def run_forever(self): # pragma: no cover
"""
Run the updator continuously.
Expand All @@ -88,21 +106,33 @@ def run_forever(self): # pragma: no cover
while True:
self.logger.info(_('Begin container update sweep'))
begin = time.time()
pids = []
now = time.time()
expired_suppressions = \
[a for a, u in self.account_suppressions.iteritems() if u < now]
for account in expired_suppressions:
del self.account_suppressions[account]
pid2filename = {}
# read from account ring to ensure it's fresh
self.get_account_ring().get_nodes('')
for path in self.get_paths():
while len(pids) >= self.concurrency:
pids.remove(os.wait()[0])
while len(pid2filename) >= self.concurrency:
pid = os.wait()[0]
try:
self._load_suppressions(pid2filename[pid])
finally:
del pid2filename[pid]
fd, tmpfilename = mkstemp()
os.close(fd)
pid = os.fork()
if pid:
pids.append(pid)
pid2filename[pid] = tmpfilename
else:
signal.signal(signal.SIGTERM, signal.SIG_DFL)
patcher.monkey_patch(all=False, socket=True)
self.no_changes = 0
self.successes = 0
self.failures = 0
self.new_account_suppressions = open(tmpfilename, 'w')
forkbegin = time.time()
self.container_sweep(path)
elapsed = time.time() - forkbegin
Expand All @@ -114,8 +144,12 @@ def run_forever(self): # pragma: no cover
'success': self.successes, 'fail': self.failures,
'no_change': self.no_changes})
sys.exit()
while pids:
pids.remove(os.wait()[0])
while pid2filename:
pid = os.wait()[0]
try:
self._load_suppressions(pid2filename[pid])
finally:
del pid2filename[pid]
elapsed = time.time() - begin
self.logger.info(_('Container update sweep completed: %.02fs'),
elapsed)
Expand Down Expand Up @@ -165,6 +199,8 @@ def process_container(self, dbfile):
# definitely doesn't have up to date statistics.
if float(info['put_timestamp']) <= 0:
return
if self.account_suppressions.get(info['account'], 0) > time.time():
return
if info['put_timestamp'] > info['reported_put_timestamp'] or \
info['delete_timestamp'] > info['reported_delete_timestamp'] \
or info['object_count'] != info['reported_object_count'] or \
Expand Down Expand Up @@ -195,6 +231,11 @@ def process_container(self, dbfile):
self.logger.debug(
_('Update report failed for %(container)s %(dbfile)s'),
{'container': container, 'dbfile': dbfile})
self.account_suppressions[info['account']] = until = \
time.time() + self.account_suppression_time
if self.new_account_suppressions:
print >>self.new_account_suppressions, \
info['account'], until
else:
self.no_changes += 1

Expand Down
1 change: 1 addition & 0 deletions test/unit/container/test_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def test_run_once(self):
'interval': '1',
'concurrency': '1',
'node_timeout': '15',
'account_suppression_time': 0
})
cu.run_once()
containers_dir = os.path.join(self.sda1, container_server.DATADIR)
Expand Down

0 comments on commit edb4e90

Please sign in to comment.