diff --git a/src/crested/tl/_modisco_utils.py b/src/crested/tl/_modisco_utils.py index f9f9d7d3..ba4b826e 100644 --- a/src/crested/tl/_modisco_utils.py +++ b/src/crested/tl/_modisco_utils.py @@ -99,7 +99,6 @@ def _trim_pattern_by_ic( return _trim(pattern, start_idx, end_idx) - def _trim(pattern: dict, start_idx: int, end_idx: int) -> dict: """ Trims the pattern to the specified start and end indices. @@ -117,10 +116,15 @@ def _trim(pattern: dict, start_idx: int, end_idx: int) -> dict: ------- Trimmed pattern. """ + # TODO: Reading the pattern from disk should really be done in a separate function! + seqlet_dict = {} + # read seqlet information + for k in pattern["seqlets"].keys(): + seqlet_dict[k] = pattern["seqlets"][k][:] + # do actual trimming seqlets_sequences = pattern['seqlets']['sequence'] trimmed_sequences = [seq[start_idx:end_idx] for seq in seqlets_sequences] - seqlet_dict = {} seqlet_dict['sequence'] = trimmed_sequences return { "sequence": np.array(pattern["sequence"])[start_idx:end_idx], diff --git a/src/crested/tl/_tfmodisco.py b/src/crested/tl/_tfmodisco.py index 034e0001..6e3e398b 100644 --- a/src/crested/tl/_tfmodisco.py +++ b/src/crested/tl/_tfmodisco.py @@ -636,12 +636,16 @@ def process_patterns( f"{cell_type.replace(' ', '_')}_{metacluster_name}_{pattern_idx}" ) is_pos = metacluster_name == "pos_patterns" - trimmed_patterns.append( - _trim_pattern_by_ic( + pattern = _trim_pattern_by_ic( hdf5_results[metacluster_name][p], is_pos, trim_ic_threshold ) + # store file path so it is possible to track back + # where the pattern comes from. + pattern["file_path"] = h5_file + trimmed_patterns.append( + pattern ) is_pattern_pos.append(is_pos) pattern_idx = pattern_idx + 1