Skip to content

Commit

Permalink
Improved string quoting (#41)
Browse files Browse the repository at this point in the history
* Initial tests and support for other quotechars

* Quote parsing passes tests

* Test files

* Writer tests

* First iteration passes tests

* Added option to always quote strings

* remove unneeded option

* Add performance test for writing

* Renamed options and better empty string handeling
  • Loading branch information
jojoelfe authored Dec 5, 2023
1 parent 363aa91 commit 9f3fe21
Show file tree
Hide file tree
Showing 10 changed files with 187 additions and 10 deletions.
7 changes: 6 additions & 1 deletion starfile/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals
default behaviour in the case of only one data block being present in the STAR file is to
return only a dataframe, this can be changed by setting 'always_dict=True'
"""

parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
if len(parser.data_blocks) == 1 and always_dict is False:
return list(parser.data_blocks.values())[0]
Expand All @@ -35,6 +36,8 @@ def write(
float_format: str = '%.6f',
sep: str = '\t',
na_rep: str = '<NA>',
quote_character: str = '"',
quote_all_strings: bool = False,
**kwargs,
):
"""Write data blocks as STAR files."""
Expand All @@ -43,5 +46,7 @@ def write(
filename=filename,
float_format=float_format,
na_rep=na_rep,
separator=sep
separator=sep,
quote_character=quote_character,
quote_all_strings=quote_all_strings,
)
13 changes: 9 additions & 4 deletions starfile/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import deque
from io import StringIO
from linecache import getline
import shlex

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -71,7 +72,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
if self.current_line.startswith('data'):
break
elif self.current_line.startswith('_'): # '_foo bar'
k, v = self.current_line.split()
k, v = shlex.split(self.current_line)
block[k[1:]] = numericise(v)
self.current_line_number += 1
return block
Expand Down Expand Up @@ -103,12 +104,16 @@ def _parse_loop_block(self) -> pd.DataFrame:
df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
else:
df = pd.read_csv(
StringIO(loop_data),
StringIO(loop_data.replace("'",'"')),
delim_whitespace=True,
header=None,
comment='#'
comment='#',
keep_default_na=False
)
df = df.apply(pd.to_numeric, errors='ignore')
df_numeric = df.apply(pd.to_numeric, errors='ignore')
# Replace columns that are all NaN with the original string columns
df_numeric.loc[:, df_numeric.isna().all()] = df.loc[:, df_numeric.isna().all()]
df = df_numeric
df.columns = loop_column_names
return df

Expand Down
33 changes: 29 additions & 4 deletions starfile/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Union, Dict, List
from importlib.metadata import version
import csv

import pandas as pd

Expand All @@ -24,6 +25,8 @@ def __init__(
float_format: str = '%.6f',
separator: str = '\t',
na_rep: str = '<NA>',
quote_character: str = '"',
quote_all_strings: bool = False,
):
# coerce data
self.data_blocks = self.coerce_data_blocks(data_blocks)
Expand All @@ -33,6 +36,8 @@ def __init__(
self.float_format = float_format
self.sep = separator
self.na_rep = na_rep
self.quote_character = quote_character
self.quote_all_strings = quote_all_strings
self.buffer = TextBuffer()
self.backup_if_file_exists()
self.write()
Expand Down Expand Up @@ -67,7 +72,9 @@ def write_data_blocks(self):
write_simple_block(
file=self.filename,
block_name=block_name,
data=block
data=block,
quote_character=self.quote_character,
quote_all_strings=self.quote_all_strings
)
elif isinstance(block, pd.DataFrame):
write_loop_block(
Expand All @@ -77,6 +84,8 @@ def write_data_blocks(self):
float_format=self.float_format,
separator=self.sep,
na_rep=self.na_rep,
quote_character=self.quote_character,
quote_all_strings=self.quote_all_strings
)

def backup_if_file_exists(self):
Expand Down Expand Up @@ -123,13 +132,22 @@ def write_package_info(file: Path):
def write_simple_block(
file: Path,
block_name: str,
data: Dict[str, Union[str, int, float]]
):
data: Dict[str, Union[str, int, float]],
quote_character: str = '"',
quote_all_strings: bool = False
):
quoted_data = {
k: f"{quote_character}{v}{quote_character}"
if isinstance(v, str) and (quote_all_strings or " " in v or v == "")
else v
for k, v
in data.items()
}
formatted_lines = '\n'.join(
[
f'_{k}\t\t\t{v}'
for k, v
in data.items()
in quoted_data.items()
]
)
with open(file, mode='a') as f:
Expand All @@ -145,6 +163,8 @@ def write_loop_block(
float_format: str = '%.6f',
separator: str = '\t',
na_rep: str = '<NA>',
quote_character: str = '"',
quote_all_strings: bool = False
):
# write header
header_lines = [
Expand All @@ -158,6 +178,10 @@ def write_loop_block(
f.write('\n'.join(header_lines))
f.write('\n')

df = df.applymap(lambda x: f'{quote_character}{x}{quote_character}'
if isinstance(x, str) and (quote_all_strings or " " in x or x == "")
else x)

# write data
df.to_csv(
path_or_buf=file,
Expand All @@ -167,5 +191,6 @@ def write_loop_block(
index=False,
float_format=float_format,
na_rep=na_rep,
quoting=csv.QUOTE_NONE
)
write_blank_lines(file, n=2)
4 changes: 4 additions & 0 deletions tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star'
two_basic_blocks = test_data_directory / 'two_basic_blocks.star'
empty_loop = test_data_directory / 'empty_loop.star'
basic_single_quote = test_data_directory / 'basic_single_quote.star'
basic_double_quote = test_data_directory / 'basic_double_quote.star'
loop_single_quote = test_data_directory / 'loop_single_quote.star'
loop_double_quote = test_data_directory / 'loop_double_quote.star'

# Example DataFrame for testing
cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'],
Expand Down
7 changes: 7 additions & 0 deletions tests/data/basic_double_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
data_

_no_quote_string noquote
_quote_string "quote string"
_whitespace_string " "
_empty_string ""

7 changes: 7 additions & 0 deletions tests/data/basic_single_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
data_

_no_quote_string noquote
_quote_string 'quote string'
_whitespace_string ' '
_empty_string ''

13 changes: 13 additions & 0 deletions tests/data/loop_double_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
data_

loop_
_no_quote_string #1
_quote_string #2
_whitespace_string #3
_empty_string #4
_number_and_string #5
_number_and_empty #6
_number #7
_empty_string_and_normal_string #8
noquote "quote string" " " "" 4.0 5.0 6.0 ""
noquote "quote string" " " "" noquote "" 7.0 test
13 changes: 13 additions & 0 deletions tests/data/loop_single_quote.star
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
data_

loop_
_no_quote_string #1
_quote_string #2
_whitespace_string #3
_empty_string #4
_number_and_string #5
_number_and_empty #6
_number #7
_empty_string_and_normal_string #8
noquote 'quote string' ' ' '' 4.0 5.0 6.0 ''
noquote 'quote string' ' ' '' noquote '' 7.0 test
38 changes: 38 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
two_single_line_loop_blocks,
two_basic_blocks,
empty_loop,
basic_single_quote,
basic_double_quote,
loop_single_quote,
loop_double_quote,
)
from .utils import generate_large_star_file, remove_large_star_file, million_row_file

Expand Down Expand Up @@ -237,3 +241,37 @@ def test_empty_loop_block():
"""Parsing an empty loop block should return an empty dataframe."""
parser = StarParser(empty_loop)
assert len(parser.data_blocks) == 1



@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote),
('"',basic_double_quote),
])
def test_quote_basic(quote_character,filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks['']['no_quote_string'] == "noquote"
assert parser.data_blocks['']['quote_string'] == "quote string"
assert parser.data_blocks['']['whitespace_string'] == " "
assert parser.data_blocks['']['empty_string'] == ""

@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote),
('"',loop_double_quote),
])
def test_quote_loop(quote_character,filename):
import math
parser = StarParser(filename)
assert len(parser.data_blocks) == 1
assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
assert parser.data_blocks[''].loc[0,'empty_string'] == ""

assert parser.data_blocks[''].dtypes['number_and_string'] == object
assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64'
assert parser.data_blocks[''].dtypes['number'] == 'float64'
assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object

assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty'])
assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == ''
62 changes: 61 additions & 1 deletion tests/test_writing.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from os.path import join as join_path
from tempfile import TemporaryDirectory
import time
import math

import pandas as pd
import pytest

from starfile.parser import StarParser
from starfile.writer import StarWriter

from .constants import loop_simple, postprocess, test_data_directory, test_df

from .utils import generate_large_star_file, remove_large_star_file

def test_write_simple_block():
s = StarParser(postprocess)
Expand Down Expand Up @@ -68,3 +71,60 @@ def test_can_write_non_zero_indexed_one_row_dataframe():
"1\t2\t3"
)
assert (expected in output)


@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes",
[('"', False, 6),
('"', True, 8),
("'", False, 6),
("'", True, 8)
])
def test_string_quoting_loop_datablock(quote_character, quote_all_strings, num_quotes, tmp_path):
df = pd.DataFrame([[1,"nospace", "String with space", " ", ""]],
columns=["a_number","string_without_space", "string_space", "just_space", "empty_string"])

filename = tmp_path / "test.star"
StarWriter(df, filename, quote_character=quote_character, quote_all_strings=quote_all_strings)

# Test for the appropriate number of quotes
with open(filename) as f:
star_content = f.read()
assert star_content.count(quote_character) == num_quotes

s = StarParser(filename)
assert df.equals(s.data_blocks[""])

def test_writing_speed():
start = time.time()
generate_large_star_file()
end = time.time()
remove_large_star_file()

# Check that execution takes less than a second
assert end - start < 1

@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes",
[('"', False, 6),
('"', True, 8),
("'", False, 6),
("'", True, 8)
])
def test_string_quoting_simple_datablock(quote_character, quote_all_strings,num_quotes, tmp_path):
o = {
"a_number": 1,
"string_without_space": "nospace",
"string_space": "String with space",
"just_space": " ",
"empty_string": ""
}

filename = tmp_path / "test.star"
StarWriter(o, filename, quote_character=quote_character, quote_all_strings=quote_all_strings)

# Test for the appropriate number of quotes
with open(filename) as f:
star_content = f.read()
assert star_content.count(quote_character) == num_quotes

s = StarParser(filename)
assert o == s.data_blocks[""]

0 comments on commit 9f3fe21

Please sign in to comment.