Skip to content

Commit

Permalink
Do not increase failed_rpc_counter while skaled init is in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
badrogger committed Jan 29, 2025
1 parent 7d02d9a commit eb5e23e
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 286 deletions.
27 changes: 12 additions & 15 deletions core/schains/monitor/rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,50 +26,47 @@
from tools.docker_utils import DockerUtils

from tools.configs.schains import MAX_SCHAIN_FAILED_RPC_COUNT
from tools.configs.containers import (
MAX_SCHAIN_RESTART_COUNT,
SCHAIN_CONTAINER
)
from tools.configs.containers import MAX_SCHAIN_RESTART_COUNT, SCHAIN_CONTAINER

logger = logging.getLogger(__name__)


def handle_failed_schain_rpc(
schain: SchainStructure,
schain_record,
skaled_status,
dutils=None
):
def handle_failed_schain_rpc(schain: SchainStructure, schain_record, skaled_status, dutils=None):
dutils = dutils or DockerUtils()
logger.info(f'Monitoring RPC for sChain {schain.name}')

if not is_container_exists(schain.name, dutils=dutils):
logger.warning(f'{schain.name} RPC monitor failed: container doesn\'t exit')
logger.warning('RPC monitor failed: container does not exist')
return

if not is_container_running(schain.name, dutils=dutils):
logger.warning(f'{schain.name} RPC monitor failed: container is not running')
logger.warning('RPC monitor failed: container is not running')
return

if skaled_status.exit_time_reached:
logger.info(f'{schain.name} - Skipping RPC monitor: exit time reached')
logger.info('Skipping RPC monitor: exit time reached')
skaled_status.log()
schain_record.set_failed_rpc_count(0)
return

if skaled_status.downloading_snapshot:
logger.info(f'{schain.name} - Skipping RPC monitor: downloading snapshot')
logger.info('Skipping RPC monitor: downloading snapshot')
skaled_status.log()
schain_record.set_failed_rpc_count(0)
return

if not skaled_status.subsystem_running['Rpc']:
logger.info('Skipping RPC monitor: Rpc has not been initialized')
skaled_status.log()
schain_record.set_failed_rpc_count(0)

rpc_stuck = schain_record.failed_rpc_count > MAX_SCHAIN_FAILED_RPC_COUNT
logger.info(
'SChain %s, rpc stuck: %s, failed_rpc_count: %d, restart_count: %d',
schain.name,
rpc_stuck,
schain_record.failed_rpc_count,
schain_record.restart_count
schain_record.restart_count,
)
if rpc_stuck:
if schain_record.restart_count < MAX_SCHAIN_RESTART_COUNT:
Expand Down
35 changes: 3 additions & 32 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
ENV_TYPE,
META_FILEPATH,
SSL_CERTIFICATES_FILEPATH,
STATIC_GROUPS_FOLDER
STATIC_GROUPS_FOLDER,
)
from tools.configs.containers import CONTAINERS_FILEPATH
from tools.configs.ima import SCHAIN_IMA_ABI_FILEPATH
Expand All @@ -81,6 +81,7 @@
init_skale_from_wallet,
init_skale_ima,
upsert_schain_record_with_config,
generate_schain_skaled_status_file,
)

NUMBER_OF_NODES = 2
Expand Down Expand Up @@ -197,28 +198,6 @@ def get_random_string(length=8):
return ''.join(random.choice(letters) for i in range(length))


def get_skaled_status_dict(
    snapshot_downloader=False,
    exit_time_reached=False,
    clear_data_dir=False,
    start_from_snapshot=False,
    start_again=False,
    blockchain=False,
    rpc=False,
):
    """Build a skaled status dictionary for use in tests.

    Every flag defaults to ``False``, so a call without arguments produces a
    status with all subsystems stopped and no exit state set.

    Args:
        snapshot_downloader: Value for ``subsystemRunning.SnapshotDownloader``.
        exit_time_reached: Value for ``exitState.ExitTimeReached``.
        clear_data_dir: Value for ``exitState.ClearDataDir``.
        start_from_snapshot: Value for ``exitState.StartFromSnapshot``.
        start_again: Value for ``exitState.StartAgain``.
        blockchain: Value for ``subsystemRunning.Blockchain``.
        rpc: Value for ``subsystemRunning.Rpc``.

    Returns:
        A dict with two nested sections, ``subsystemRunning`` and
        ``exitState``, populated from the arguments.
    """
    return {
        'subsystemRunning': {
            'SnapshotDownloader': snapshot_downloader,
            # Previously hard-coded to False; now configurable so tests can
            # simulate a skaled instance whose subsystems are up.
            'Blockchain': blockchain,
            'Rpc': rpc,
        },
        'exitState': {
            'ClearDataDir': clear_data_dir,
            'StartAgain': start_again,
            'StartFromSnapshot': start_from_snapshot,
            'ExitTimeReached': exit_time_reached,
        },
    }


SECRET_KEY = {
'common_public_key': [
11111111111111111111111111111111111111111111111111111111111111111111111111111,
Expand Down Expand Up @@ -305,13 +284,6 @@ def schain_config(_schain_name, secret_key, predeployed_ima):
rm_schain_dir(_schain_name)


def generate_schain_skaled_status_file(_schain_name, **kwargs):
    """Write a skaled status JSON file for the given sChain.

    Creates the sChain directory under ``SCHAINS_DIR_PATH`` if it is missing,
    then writes the status dictionary to the path returned by
    ``skaled_status_filepath``.

    Args:
        _schain_name: Name of the sChain the status file is written for.
        **kwargs: Forwarded unchanged to ``get_skaled_status_dict`` to set
            individual status flags.
    """
    # Ensure the sChain directory exists before the status file is written.
    pathlib.Path(os.path.join(SCHAINS_DIR_PATH, _schain_name)).mkdir(
        parents=True, exist_ok=True
    )
    target_path = skaled_status_filepath(_schain_name)
    status_payload = get_skaled_status_dict(**kwargs)
    write_json(target_path, status_payload)


def rm_schain_dir(schain_name):
schain_dir_path = os.path.join(SCHAINS_DIR_PATH, schain_name)
# fix permission denied after schain container running
Expand Down Expand Up @@ -607,8 +579,7 @@ def static_groups_for_schain(_schain_name):
parent_folder = os.path.join(STATIC_GROUPS_FOLDER, ENV_TYPE)
os.makedirs(parent_folder)
static_groups_env_path = os.path.join(
parent_folder,
os.path.join(f'schain-{_schain_name}.json')
parent_folder, os.path.join(f'schain-{_schain_name}.json')
)
try:
write_json(static_groups_env_path, STATIC_NODE_GROUPS)
Expand Down
38 changes: 30 additions & 8 deletions tests/schains/monitor/rpc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from core.schains.monitor.rpc import handle_failed_schain_rpc
from core.schains.runner import get_container_info
from core.schains.rpc import check_endpoint_blocks
from tools.configs.containers import SCHAIN_CONTAINER
from tools.configs.containers import SCHAIN_CONTAINER, MAX_SCHAIN_RESTART_COUNT
from tools.configs.schains import MAX_SCHAIN_FAILED_RPC_COUNT
from web.models.schain import SChainRecord
from tests.utils import get_schain_struct
from tests.utils import get_schain_struct, generate_schain_skaled_status_file

CURRENT_TIMESTAMP = 1594903080
CURRENT_DATETIME = datetime.datetime.utcfromtimestamp(CURRENT_TIMESTAMP)
Expand Down Expand Up @@ -39,7 +40,7 @@ def test_handle_failed_schain_rpc_exit_time_reached(

dutils.run_container(image_name=image_name, name=container_name, entrypoint='bash -c "exit 0"')
time.sleep(7)
schain_record.set_failed_rpc_count(100)
schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)

container_info = dutils.get_info(container_name)
finished_at = container_info['stats']['State']['FinishedAt']
Expand Down Expand Up @@ -67,7 +68,7 @@ def test_monitor_schain_downloading_snapshot(
image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
)
time.sleep(7)
schain_record.set_failed_rpc_count(100)
schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)

container_info = dutils.get_info(container_name)
finished_at = container_info['stats']['State']['FinishedAt']
Expand All @@ -91,8 +92,8 @@ def test_handle_failed_schain_rpc_stuck_max_retries(
image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
)

schain_record.set_failed_rpc_count(100)
schain_record.set_restart_count(100)
schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
schain_record.set_restart_count(MAX_SCHAIN_RESTART_COUNT + 1)

container_info = dutils.get_info(container_name)
finished_at = container_info['stats']['State']['FinishedAt']
Expand All @@ -116,7 +117,7 @@ def test_monitor_container_exited(schain_db, dutils, cleanup_schain_containers,
# Wait for container initialization
time.sleep(2)

schain_record.set_failed_rpc_count(100)
schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
schain_record.set_restart_count(0)

container_info = dutils.get_info(container_name)
Expand Down Expand Up @@ -145,12 +146,33 @@ def test_handle_failed_schain_rpc_stuck(
image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
)

schain_record.set_failed_rpc_count(100)
schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
schain_record.set_restart_count(0)

container_info = dutils.get_info(container_name)
finished_at = container_info['stats']['State']['FinishedAt']

assert schain_record.restart_count == 0

# Make sure restart is not executed with Rpc: False in status file
generate_schain_skaled_status_file(schain_db, rpc=False)
handle_failed_schain_rpc(
schain=get_schain_struct(schain_name=schain_db),
schain_record=schain_record,
skaled_status=skaled_status,
dutils=dutils,
)
assert schain_record.restart_count == 0
container_info = dutils.get_info(container_name)
assert container_info['stats']['State']['FinishedAt'] == finished_at

container_info = dutils.get_info(container_name)
finished_at = container_info['stats']['State']['FinishedAt']

# With Rpc: True restart should be executed
generate_schain_skaled_status_file(schain_db, rpc=True)
schain_record.set_failed_rpc_count(100)

assert schain_record.restart_count == 0
handle_failed_schain_rpc(
schain=get_schain_struct(schain_name=schain_db),
Expand Down
Loading

0 comments on commit eb5e23e

Please sign in to comment.