Skip to content

Commit

Permalink
Merge pull request #398 from makelinux/generated_data
Browse files (browse the repository at this point in the history)
refactor: generated_data as list
  • Loading branch information
bbrowning authored Dec 10, 2024
2 parents fd53dcd + 3fce0a5 commit dcbabc5
Showing 1 changed file with 3 additions and 9 deletions.
12 changes: 3 additions & 9 deletions src/instructlab/sdg/generate_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -389,7 +389,7 @@ def generate_data(
"Synthesizing new instructions. If you aren't satisfied with the generated instructions, interrupt training (Ctrl-C) and try adjusting your YAML files. Adding more examples may help."
)

- generated_data = None
+ generated_data = []
empty_sdg_leaf_nodes = []
for leaf_node in leaf_nodes.values():
is_knowledge = False
Expand Down Expand Up @@ -424,11 +424,8 @@ def generate_data(
empty_sdg_leaf_nodes.append(leaf_node_path)
logger.warning("Empty dataset for qna node: %s", leaf_node_path)
continue
- generated_data = (
- [new_generated_data]
- if generated_data is None
- else generated_data + [new_generated_data]
- )
+ generated_data.append(new_generated_data)

logger.info("Generated %d samples", len(generated_data))
logger.debug("Generated data: %s", generated_data)

Expand All @@ -449,9 +446,6 @@ def generate_data(
use_legacy_pretraining_format,
)

- if generated_data is None:
- generated_data = []

_gen_train_data(
generated_data,
os.path.join(output_dir, output_file_train),
Expand Down

0 comments on commit dcbabc5

Please sign in to comment.