diff --git a/starfile/functions.py b/starfile/functions.py index e952d29..4a1be42 100644 --- a/starfile/functions.py +++ b/starfile/functions.py @@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals default behaviour in the case of only one data block being present in the STAR file is to return only a dataframe, this can be changed by setting 'always_dict=True' """ + parser = StarParser(filename, n_blocks_to_read=read_n_blocks) if len(parser.data_blocks) == 1 and always_dict is False: return list(parser.data_blocks.values())[0] @@ -35,6 +36,8 @@ def write( float_format: str = '%.6f', sep: str = '\t', na_rep: str = '', + quote_character: str = '"', + quote_all_strings: bool = False, **kwargs, ): """Write data blocks as STAR files.""" @@ -43,5 +46,7 @@ def write( filename=filename, float_format=float_format, na_rep=na_rep, - separator=sep + separator=sep, + quote_character=quote_character, + quote_all_strings=quote_all_strings, ) diff --git a/starfile/parser.py b/starfile/parser.py index dc98a39..0b485a7 100644 --- a/starfile/parser.py +++ b/starfile/parser.py @@ -3,6 +3,7 @@ from collections import deque from io import StringIO from linecache import getline +import shlex import numpy as np import pandas as pd @@ -71,7 +72,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]: if self.current_line.startswith('data'): break elif self.current_line.startswith('_'): # '_foo bar' - k, v = self.current_line.split() + k, v = shlex.split(self.current_line) block[k[1:]] = numericise(v) self.current_line_number += 1 return block @@ -103,12 +104,16 @@ def _parse_loop_block(self) -> pd.DataFrame: df = pd.DataFrame(np.zeros(shape=(0, n_cols))) else: df = pd.read_csv( - StringIO(loop_data), + StringIO(loop_data.replace("'",'"')), delim_whitespace=True, header=None, - comment='#' + comment='#', + keep_default_na=False ) - df = df.apply(pd.to_numeric, errors='ignore') + df_numeric = df.apply(pd.to_numeric, errors='ignore') + # Replace columns that are all NaN with the original string columns + df_numeric.loc[:, df_numeric.isna().all()] = df.loc[:, df_numeric.isna().all()] + df = df_numeric df.columns = loop_column_names return df diff --git a/starfile/writer.py b/starfile/writer.py index 2d3a07c..3ec92d1 100644 --- a/starfile/writer.py +++ b/starfile/writer.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Union, Dict, List from importlib.metadata import version +import csv import pandas as pd @@ -24,6 +25,8 @@ def __init__( float_format: str = '%.6f', separator: str = '\t', na_rep: str = '', + quote_character: str = '"', + quote_all_strings: bool = False, ): # coerce data self.data_blocks = self.coerce_data_blocks(data_blocks) @@ -33,6 +36,8 @@ def __init__( self.float_format = float_format self.sep = separator self.na_rep = na_rep + self.quote_character = quote_character + self.quote_all_strings = quote_all_strings self.buffer = TextBuffer() self.backup_if_file_exists() self.write() @@ -67,7 +72,9 @@ def write_data_blocks(self): write_simple_block( file=self.filename, block_name=block_name, - data=block + data=block, + quote_character=self.quote_character, + quote_all_strings=self.quote_all_strings ) elif isinstance(block, pd.DataFrame): write_loop_block( @@ -77,6 +84,8 @@ def write_data_blocks(self): float_format=self.float_format, separator=self.sep, na_rep=self.na_rep, + quote_character=self.quote_character, + quote_all_strings=self.quote_all_strings ) def backup_if_file_exists(self): @@ -123,13 +132,22 @@ def write_package_info(file: Path): def write_simple_block( file: Path, block_name: str, - data: Dict[str, Union[str, int, float]] -): + data: Dict[str, Union[str, int, float]], + quote_character: str = '"', + quote_all_strings: bool = False +): + quoted_data = { + k: f"{quote_character}{v}{quote_character}" + if isinstance(v, str) and (quote_all_strings or " " in v or v == "") + else v + for k, v + in data.items() + } formatted_lines = '\n'.join( [ f'_{k}\t\t\t{v}' for k, v - in data.items() + in quoted_data.items() ] ) with open(file, mode='a') as f: @@ -145,6 +163,8 @@ def write_loop_block( float_format: str = '%.6f', separator: str = '\t', na_rep: str = '', + quote_character: str = '"', + quote_all_strings: bool = False ): # write header header_lines = [ @@ -158,6 +178,10 @@ def write_loop_block( f.write('\n'.join(header_lines)) f.write('\n') + df = df.applymap(lambda x: f'{quote_character}{x}{quote_character}' + if isinstance(x, str) and (quote_all_strings or " " in x or x == "") + else x) + # write data df.to_csv( path_or_buf=file, @@ -167,5 +191,6 @@ def write_loop_block( index=False, float_format=float_format, na_rep=na_rep, + quoting=csv.QUOTE_NONE ) write_blank_lines(file, n=2) diff --git a/tests/constants.py b/tests/constants.py index 3f52707..9a9ad68 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -21,6 +21,10 @@ two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star' two_basic_blocks = test_data_directory / 'two_basic_blocks.star' empty_loop = test_data_directory / 'empty_loop.star' +basic_single_quote = test_data_directory / 'basic_single_quote.star' +basic_double_quote = test_data_directory / 'basic_double_quote.star' +loop_single_quote = test_data_directory / 'loop_single_quote.star' +loop_double_quote = test_data_directory / 'loop_double_quote.star' # Example DataFrame for testing cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'], diff --git a/tests/data/basic_double_quote.star b/tests/data/basic_double_quote.star new file mode 100644 index 0000000..6db9dc5 --- /dev/null +++ b/tests/data/basic_double_quote.star @@ -0,0 +1,7 @@ +data_ + +_no_quote_string noquote +_quote_string "quote string" +_whitespace_string " " +_empty_string "" + diff --git a/tests/data/basic_single_quote.star b/tests/data/basic_single_quote.star new file mode 100644 index 0000000..3066d88 --- /dev/null +++ b/tests/data/basic_single_quote.star @@ -0,0 +1,7 @@ +data_ + +_no_quote_string noquote +_quote_string 'quote string' +_whitespace_string ' ' +_empty_string '' + diff --git a/tests/data/loop_double_quote.star b/tests/data/loop_double_quote.star new file mode 100644 index 0000000..e9d6730 --- /dev/null +++ b/tests/data/loop_double_quote.star @@ -0,0 +1,13 @@ +data_ + +loop_ +_no_quote_string #1 +_quote_string #2 +_whitespace_string #3 +_empty_string #4 +_number_and_string #5 +_number_and_empty #6 +_number #7 +_empty_string_and_normal_string #8 +noquote "quote string" " " "" 4.0 5.0 6.0 "" +noquote "quote string" " " "" noquote "" 7.0 test diff --git a/tests/data/loop_single_quote.star b/tests/data/loop_single_quote.star new file mode 100644 index 0000000..d0fcb78 --- /dev/null +++ b/tests/data/loop_single_quote.star @@ -0,0 +1,13 @@ +data_ + +loop_ +_no_quote_string #1 +_quote_string #2 +_whitespace_string #3 +_empty_string #4 +_number_and_string #5 +_number_and_empty #6 +_number #7 +_empty_string_and_normal_string #8 +noquote 'quote string' ' ' '' 4.0 5.0 6.0 '' +noquote 'quote string' ' ' '' noquote '' 7.0 test diff --git a/tests/test_parsing.py b/tests/test_parsing.py index dd6cad8..78f7c96 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -20,6 +20,10 @@ two_single_line_loop_blocks, two_basic_blocks, empty_loop, + basic_single_quote, + basic_double_quote, + loop_single_quote, + loop_double_quote, ) from .utils import generate_large_star_file, remove_large_star_file, million_row_file @@ -237,3 +241,37 @@ def test_empty_loop_block(): """Parsing an empty loop block should return an empty dataframe.""" parser = StarParser(empty_loop) assert len(parser.data_blocks) == 1 + + + +@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote), + ('"',basic_double_quote), + ]) +def test_quote_basic(quote_character,filename): + import math + parser = StarParser(filename) + assert len(parser.data_blocks) == 1 + assert parser.data_blocks['']['no_quote_string'] == "noquote" + assert parser.data_blocks['']['quote_string'] == "quote string" + assert parser.data_blocks['']['whitespace_string'] == " " + assert parser.data_blocks['']['empty_string'] == "" + +@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote), + ('"',loop_double_quote), + ]) +def test_quote_loop(quote_character,filename): + import math + parser = StarParser(filename) + assert len(parser.data_blocks) == 1 + assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote" + assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string" + assert parser.data_blocks[''].loc[0,'whitespace_string'] == " " + assert parser.data_blocks[''].loc[0,'empty_string'] == "" + + assert parser.data_blocks[''].dtypes['number_and_string'] == object + assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64' + assert parser.data_blocks[''].dtypes['number'] == 'float64' + assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object + + assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty']) + assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == '' diff --git a/tests/test_writing.py b/tests/test_writing.py index 819f285..24ec1a0 100644 --- a/tests/test_writing.py +++ b/tests/test_writing.py @@ -1,13 +1,16 @@ from os.path import join as join_path from tempfile import TemporaryDirectory +import time +import math import pandas as pd +import pytest from starfile.parser import StarParser from starfile.writer import StarWriter from .constants import loop_simple, postprocess, test_data_directory, test_df - +from .utils import generate_large_star_file, remove_large_star_file def test_write_simple_block(): s = StarParser(postprocess) @@ -68,3 +71,60 @@ def test_can_write_non_zero_indexed_one_row_dataframe(): "1\t2\t3" ) assert (expected in output) + + +@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes", + [('"', False, 6), + ('"', True, 8), + ("'", False, 6), + ("'", True, 8) + ]) +def test_string_quoting_loop_datablock(quote_character, quote_all_strings, num_quotes, tmp_path): + df = pd.DataFrame([[1,"nospace", "String with space", " ", ""]], + columns=["a_number","string_without_space", "string_space", "just_space", "empty_string"]) + + filename = tmp_path / "test.star" + StarWriter(df, filename, quote_character=quote_character, quote_all_strings=quote_all_strings) + + # Test for the appropriate number of quotes + with open(filename) as f: + star_content = f.read() + assert star_content.count(quote_character) == num_quotes + + s = StarParser(filename) + assert df.equals(s.data_blocks[""]) + +def test_writing_speed(): + start = time.time() + generate_large_star_file() + end = time.time() + remove_large_star_file() + + # Check that execution takes less than a second + assert end - start < 1 + +@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes", + [('"', False, 6), + ('"', True, 8), + ("'", False, 6), + ("'", True, 8) + ]) +def test_string_quoting_simple_datablock(quote_character, quote_all_strings,num_quotes, tmp_path): + o = { + "a_number": 1, + "string_without_space": "nospace", + "string_space": "String with space", + "just_space": " ", + "empty_string": "" + } + + filename = tmp_path / "test.star" + StarWriter(o, filename, quote_character=quote_character, quote_all_strings=quote_all_strings) + + # Test for the appropriate number of quotes + with open(filename) as f: + star_content = f.read() + assert star_content.count(quote_character) == num_quotes + + s = StarParser(filename) + assert o == s.data_blocks[""]