Improved string quoting (#41)

* Initial tests and support for other quotechars
* Quote parsing passes tests
* Test files
* Writer tests
* First iteration passes tests
* Added option to always quote strings
* Remove unneeded option
* Add performance test for writing
* Renamed options and better empty string handling
teamtomo · Dec 5, 2023 · 9f3fe21 · 9f3fe21
1 parent 363aa91
commit 9f3fe21
Show file tree

Hide file tree

Showing 10 changed files with 187 additions and 10 deletions.
diff --git a/starfile/functions.py b/starfile/functions.py
@@ -22,6 +22,7 @@ def read(filename: PathLike, read_n_blocks: int = None, always_dict: bool = Fals
     default behaviour in the case of only one data block being present in the STAR file is to
     return only a dataframe, this can be changed by setting 'always_dict=True'
     """
+
     parser = StarParser(filename, n_blocks_to_read=read_n_blocks)
     if len(parser.data_blocks) == 1 and always_dict is False:
         return list(parser.data_blocks.values())[0]
@@ -35,6 +36,8 @@ def write(
     float_format: str = '%.6f',
     sep: str = '\t',
     na_rep: str = '<NA>',
+    quote_character: str = '"',
+    quote_all_strings: bool = False,
     **kwargs,
 ):
     """Write data blocks as STAR files."""
@@ -43,5 +46,7 @@ def write(
         filename=filename,
         float_format=float_format,
         na_rep=na_rep,
-        separator=sep
+        separator=sep,
+        quote_character=quote_character,
+        quote_all_strings=quote_all_strings,
     )
diff --git a/starfile/parser.py b/starfile/parser.py
@@ -3,6 +3,7 @@
 from collections import deque
 from io import StringIO
 from linecache import getline
+import shlex
 
 import numpy as np
 import pandas as pd
@@ -71,7 +72,7 @@ def _parse_simple_block(self) -> Dict[str, Union[str, int, float]]:
             if self.current_line.startswith('data'):
                 break
             elif self.current_line.startswith('_'):  # '_foo bar'
-                k, v = self.current_line.split()
+                k, v = shlex.split(self.current_line)
                 block[k[1:]] = numericise(v)
             self.current_line_number += 1
         return block
@@ -103,12 +104,16 @@ def _parse_loop_block(self) -> pd.DataFrame:
             df = pd.DataFrame(np.zeros(shape=(0, n_cols)))
         else:
             df = pd.read_csv(
-                StringIO(loop_data),
+                StringIO(loop_data.replace("'",'"')),
                 delim_whitespace=True,
                 header=None,
-                comment='#'
+                comment='#',
+                keep_default_na=False
             )
-            df = df.apply(pd.to_numeric, errors='ignore')
+            df_numeric = df.apply(pd.to_numeric, errors='ignore')
+            # Replace columns that are all NaN with the original string columns
+            df_numeric.loc[:, df_numeric.isna().all()] = df.loc[:, df_numeric.isna().all()]
+            df = df_numeric
             df.columns = loop_column_names
         return df
 

diff --git a/starfile/writer.py b/starfile/writer.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Union, Dict, List
 from importlib.metadata import version
+import csv
 
 import pandas as pd
 
@@ -24,6 +25,8 @@ def __init__(
         float_format: str = '%.6f',
         separator: str = '\t',
         na_rep: str = '<NA>',
+        quote_character: str = '"',
+        quote_all_strings: bool = False,
     ):
         # coerce data
         self.data_blocks = self.coerce_data_blocks(data_blocks)
@@ -33,6 +36,8 @@ def __init__(
         self.float_format = float_format
         self.sep = separator
         self.na_rep = na_rep
+        self.quote_character = quote_character
+        self.quote_all_strings = quote_all_strings
         self.buffer = TextBuffer()
         self.backup_if_file_exists()
         self.write()
@@ -67,7 +72,9 @@ def write_data_blocks(self):
                 write_simple_block(
                     file=self.filename,
                     block_name=block_name,
-                    data=block
+                    data=block,
+                    quote_character=self.quote_character,
+                    quote_all_strings=self.quote_all_strings
                 )
             elif isinstance(block, pd.DataFrame):
                 write_loop_block(
@@ -77,6 +84,8 @@ def write_data_blocks(self):
                     float_format=self.float_format,
                     separator=self.sep,
                     na_rep=self.na_rep,
+                    quote_character=self.quote_character,
+                    quote_all_strings=self.quote_all_strings
                 )
 
     def backup_if_file_exists(self):
@@ -123,13 +132,22 @@ def write_package_info(file: Path):
 def write_simple_block(
     file: Path,
     block_name: str,
-    data: Dict[str, Union[str, int, float]]
-):
+    data: Dict[str, Union[str, int, float]],
+    quote_character: str = '"',
+    quote_all_strings: bool = False
+):  
+    quoted_data = {
+        k: f"{quote_character}{v}{quote_character}" 
+        if isinstance(v, str) and (quote_all_strings or " " in v or v == "") 
+        else v
+        for k, v
+        in data.items()    
+    }
     formatted_lines = '\n'.join(
         [
             f'_{k}\t\t\t{v}'
             for k, v
-            in data.items()
+            in quoted_data.items()
         ]
     )
     with open(file, mode='a') as f:
@@ -145,6 +163,8 @@ def write_loop_block(
     float_format: str = '%.6f',
     separator: str = '\t',
     na_rep: str = '<NA>',
+    quote_character: str = '"',
+    quote_all_strings: bool = False
 ):
     # write header
     header_lines = [
@@ -158,6 +178,10 @@ def write_loop_block(
         f.write('\n'.join(header_lines))
         f.write('\n')
 
+    df = df.applymap(lambda x: f'{quote_character}{x}{quote_character}' 
+                     if isinstance(x, str) and (quote_all_strings or " " in x or x == "") 
+                     else x)
+
     # write data
     df.to_csv(
         path_or_buf=file,
@@ -167,5 +191,6 @@ def write_loop_block(
         index=False,
         float_format=float_format,
         na_rep=na_rep,
+        quoting=csv.QUOTE_NONE
     )
     write_blank_lines(file, n=2)
diff --git a/tests/constants.py b/tests/constants.py
@@ -21,6 +21,10 @@
 two_single_line_loop_blocks = test_data_directory / 'two_single_line_loop_blocks.star'
 two_basic_blocks = test_data_directory / 'two_basic_blocks.star'
 empty_loop = test_data_directory / 'empty_loop.star'
+basic_single_quote = test_data_directory / 'basic_single_quote.star'
+basic_double_quote = test_data_directory / 'basic_double_quote.star'
+loop_single_quote = test_data_directory / 'loop_single_quote.star'
+loop_double_quote = test_data_directory / 'loop_double_quote.star'
 
 # Example DataFrame for testing
 cars = {'Brand': ['Honda_Civic', 'Toyota_Corolla', 'Ford_Focus', 'Audi_A4'],

diff --git a/tests/data/basic_double_quote.star b/tests/data/basic_double_quote.star
@@ -0,0 +1,7 @@
+data_
+
+_no_quote_string           noquote
+_quote_string              "quote string"
+_whitespace_string         " "
+_empty_string              ""
+
diff --git a/tests/data/basic_single_quote.star b/tests/data/basic_single_quote.star
@@ -0,0 +1,7 @@
+data_
+
+_no_quote_string           noquote
+_quote_string              'quote string'
+_whitespace_string         ' '
+_empty_string              ''
+
diff --git a/tests/data/loop_double_quote.star b/tests/data/loop_double_quote.star
@@ -0,0 +1,13 @@
+data_
+
+loop_
+_no_quote_string #1
+_quote_string #2
+_whitespace_string #3
+_empty_string #4
+_number_and_string #5
+_number_and_empty #6
+_number #7
+_empty_string_and_normal_string #8
+noquote "quote string" " " "" 4.0 5.0 6.0 ""
+noquote "quote string" " " "" noquote "" 7.0 test
diff --git a/tests/data/loop_single_quote.star b/tests/data/loop_single_quote.star
@@ -0,0 +1,13 @@
+data_
+
+loop_
+_no_quote_string #1
+_quote_string #2
+_whitespace_string #3
+_empty_string #4
+_number_and_string #5
+_number_and_empty #6
+_number #7
+_empty_string_and_normal_string #8
+noquote 'quote string' ' ' '' 4.0 5.0 6.0 ''
+noquote 'quote string' ' ' '' noquote '' 7.0 test
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
@@ -20,6 +20,10 @@
     two_single_line_loop_blocks,
     two_basic_blocks,
     empty_loop,
+    basic_single_quote,
+    basic_double_quote,
+    loop_single_quote,
+    loop_double_quote,
 )
 from .utils import generate_large_star_file, remove_large_star_file, million_row_file
 
@@ -237,3 +241,37 @@ def test_empty_loop_block():
     """Parsing an empty loop block should return an empty dataframe."""
     parser = StarParser(empty_loop)
     assert len(parser.data_blocks) == 1
+
+
+
+@pytest.mark.parametrize("quote_character, filename", [("'",basic_single_quote), 
+                                                 ('"',basic_double_quote), 
+                                                 ])
+def test_quote_basic(quote_character,filename):
+    import math
+    parser = StarParser(filename)
+    assert len(parser.data_blocks) == 1
+    assert parser.data_blocks['']['no_quote_string'] == "noquote"
+    assert parser.data_blocks['']['quote_string'] == "quote string"
+    assert parser.data_blocks['']['whitespace_string'] == " "
+    assert parser.data_blocks['']['empty_string'] == ""
+
+@pytest.mark.parametrize("quote_character, filename", [("'",loop_single_quote), 
+                                                 ('"',loop_double_quote), 
+                                                 ])
+def test_quote_loop(quote_character,filename):
+    import math
+    parser = StarParser(filename)
+    assert len(parser.data_blocks) == 1
+    assert parser.data_blocks[''].loc[0,'no_quote_string'] == "noquote"
+    assert parser.data_blocks[''].loc[0,'quote_string'] == "quote string"
+    assert parser.data_blocks[''].loc[0,'whitespace_string'] == " "
+    assert parser.data_blocks[''].loc[0,'empty_string'] == ""
+
+    assert parser.data_blocks[''].dtypes['number_and_string'] == object
+    assert parser.data_blocks[''].dtypes['number_and_empty'] == 'float64'
+    assert parser.data_blocks[''].dtypes['number'] == 'float64'
+    assert parser.data_blocks[''].dtypes['empty_string_and_normal_string'] == object
+
+    assert math.isnan(parser.data_blocks[''].loc[1,'number_and_empty'])
+    assert parser.data_blocks[''].loc[0,'empty_string_and_normal_string'] == ''
diff --git a/tests/test_writing.py b/tests/test_writing.py
@@ -1,13 +1,16 @@
 from os.path import join as join_path
 from tempfile import TemporaryDirectory
+import time
+import math 
 
 import pandas as pd
+import pytest
 
 from starfile.parser import StarParser
 from starfile.writer import StarWriter
 
 from .constants import loop_simple, postprocess, test_data_directory, test_df
-
+from .utils import generate_large_star_file, remove_large_star_file
 
 def test_write_simple_block():
     s = StarParser(postprocess)
@@ -68,3 +71,60 @@ def test_can_write_non_zero_indexed_one_row_dataframe():
         "1\t2\t3"
     )
     assert (expected in output)
+
+
+@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes", 
+                         [('"', False, 6),
+                          ('"', True, 8),
+                          ("'", False, 6),
+                          ("'", True, 8)
+                         ])
+def test_string_quoting_loop_datablock(quote_character, quote_all_strings, num_quotes, tmp_path):
+    df = pd.DataFrame([[1,"nospace", "String with space", " ", ""]],
+                       columns=["a_number","string_without_space", "string_space", "just_space", "empty_string"])
+
+    filename = tmp_path / "test.star"
+    StarWriter(df, filename, quote_character=quote_character, quote_all_strings=quote_all_strings)
+
+    # Test for the appropriate number of quotes
+    with open(filename) as f:
+        star_content = f.read()
+        assert star_content.count(quote_character) == num_quotes
+
+    s = StarParser(filename)
+    assert df.equals(s.data_blocks[""])
+
+def test_writing_speed():
+    start = time.time()
+    generate_large_star_file()
+    end = time.time()
+    remove_large_star_file()
+
+    # Check that execution takes less than a second
+    assert end - start < 1
+
+@pytest.mark.parametrize("quote_character, quote_all_strings, num_quotes", 
+                         [('"', False, 6),
+                          ('"', True, 8),
+                          ("'", False, 6),
+                          ("'", True, 8)
+                         ])
+def test_string_quoting_simple_datablock(quote_character, quote_all_strings,num_quotes, tmp_path):
+    o = {
+        "a_number": 1,
+        "string_without_space": "nospace",
+        "string_space": "String with space",
+        "just_space": " ",
+        "empty_string": ""
+    }
+
+    filename = tmp_path / "test.star"
+    StarWriter(o, filename, quote_character=quote_character, quote_all_strings=quote_all_strings)
+
+    # Test for the appropriate number of quotes
+    with open(filename) as f:
+        star_content = f.read()
+        assert star_content.count(quote_character) == num_quotes
+
+    s = StarParser(filename)
+    assert o == s.data_blocks[""]