Initial commit

Z4JC · Sep 2, 2024 · c270f0c · c270f0c
commit c270f0c
Show file tree

Hide file tree

Showing 11 changed files with 631 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.DS_Store
+.pytest_cache
+__pycache__
+venv
+*.c
+*.egg-info
+*.so
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,10 @@
+MIT License
+
+Copyright © 2024 !ZAJC!/GDS
+Portions Copyright © 2005-2020 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,59 @@
+## Overview
+Use `uwcwidth` when you want to very quickly find out how many characters a Unicode string takes up in your terminal.
+
+For example, `uwcwidth.wcswidth('Hello🥹')` returns `7` because your terminal will use 5 places for "Hello" and then 2 places for the "🥹" emoji.
+
+`uwcwidth` is designed to run as fast as standard built-in Python string operations and use a tiny amount of memory.
+
+
+## Installation
+
+```sh
+pip install uwcwidth
+```
+
+## Isn't this easy?
+
+Let's take a look at "👩‍🦯‍➡️":
+
+While `len('\U0001F469\u200d\U0001F9AF\u200d\u27a1\ufe0f')` returns `6` because this string has 6 Unicode code points, we are looking at a single emoji "👩‍🦯‍➡️". This emoji occupies 2 characters in your terminal. Here I am assuming your terminal knows how to deal with the special Zero-Width Joiner (U+200D) and the Variation Selector-16 Emoji (U+FE0F). Things get worse as there are special modifiers for skin tone, which can be either invisible or standalone characters, etc. Also, you have to deal with other languages and their scripts, etc.
+
+## Correctness
+`uwcwidth` should work fine on various scripts such as Cyrillic, Katakana, and also Emojis in Unicode v16.0. This includes Emojis that use Variation Selector 15 and 16, Zero Width Joiner, Emoji Modifiers for skin tone, etc. Overall, `uwcwidth` is probably more accurate than whatever is currently shipping with your OS and whatever your terminal is using in 2024.
+
+Some edge cases that break other `wc(s)width` libraries and a lot of terminals:
+
+```python3
+from uwcwidth import wcswidth
+# Should be 8 terminal chars: 🏃🏾‍♂️=🏃️🏾♂
+assert wcswidth('\U0001F3C3\U0001F3FE\u200d\u2642\ufe0f'
+                '=\U0001F3C3\ufe0f\U0001F3FE\u2642') == 8
+# Should be 5 terminal chars: ⛹🏼🏴󠁧󠁢󠁳󠁣󠁴󠁿!
+assert wcswidth('\u26f9\U0001F3FC'
+                '\U0001F3F4\U000E0067\U000E0062\U000E0073'
+                '\U000E0063\U000E0074\U000E007F!') == 5
+```
+
+See the `tests` folder for more.
+
+## Tiny footprint and code
+`uwcwidth` reserves around 4 KB of memory for its lookup tables. Parts of the storage scheme are derived from an older `wcwidth` implementation in [musl libc](https://musl.libc.org/). Generally sparse or dense bitmaps are used to look things up.
+The `uwcwidth.pyx` file is under 100 lines of code, with comments and whitespace.
+
+## Performance: 30x faster than `wcwidth`
+`uwcwidth` is about 30 times faster than the popular, well-documented and highly tested [wcwidth](https://github.com/jquast/wcwidth) library, while maintaining similar accuracy. It's also 5 times faster than `cwcwidth`, which does not work on new Emojis and breaks on some other edge cases.
+
+```python3
+In [1]: import wcwidth, cwcwidth, uwcwidth
+In [2]: %%timeit
+   ...: wcwidth.wcswidth("コンニチハ, セカイ!")
+1.28 μs ± 6.22 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
+
+In [3]: %%timeit
+   ...: cwcwidth.wcswidth("コンニチハ, セカイ!")
+205 ns ± 0.408 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
+
+In [4]: %%timeit
+   ...: uwcwidth.wcswidth("コンニチハ, セカイ!")
+38.5 ns ± 0.29 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "uwcwidth"
+version = "0.9"
+authors = [{name = "!ZAJC!"}]
+readme = "README.md"
+description = "terminal width of Unicode 16.0+Emoji strings in nanoseconds"
+license = {file = "LICENSE"}
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules"
+]
+
+[project.optional-dependencies]
+test = ["pytest"]
+
+[build-system]
+requires = ["setuptools", "wheel", "Cython"]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = ["--import-mode=importlib"]
diff --git a/setup.py b/setup.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: MIT
+from setuptools import setup, Extension
+
+setup(
+    name='uwcwidth',
+    ext_modules=[Extension("uwcwidth.uwcwidth",
+                           sources=["uwcwidth/uwcwidth.pyx"])],
+    package_data={'uwcwidth': ['__init__.pxd', 'uwcwidth.pxd', 'tables.pxd']}
+)
diff --git a/tests/test_uwcwidth.py b/tests/test_uwcwidth.py
@@ -0,0 +1,242 @@
+import pytest
+
+from uwcwidth import wcswidth, wcwidth, wcwidth_uint32, is_EMB, is_EMB_uint32
+
+
+class TestSimple:
+    def test_empty(self):
+        assert wcswidth('') == 0
+
+    def test_wcwidth_only_one_codepoint(self):
+        with pytest.raises(ValueError):
+            wcwidth('')
+
+        with pytest.raises(ValueError):
+            wcwidth('ab')
+
+    def test_simple_ascii(self):
+        assert wcswidth('Hello!') == 6
+        assert wcswidth('!') == 1
+        assert wcswidth('a~@!#~Z') == 7
+
+        assert wcwidth('!') == 1
+        assert wcwidth_uint32(ord('!')) == 1
+
+        assert wcwidth('a') == 1
+        assert wcwidth_uint32(ord('a')) == 1
+
+    def test_ascii_nonprintables(self):
+        assert wcwidth('\001') == -1
+        assert wcwidth('\033') == -1
+        assert wcwidth('\r') == -1
+        assert wcwidth('\n') == -1
+        assert wcwidth('\000') == 0
+
+        assert wcswidth('Hi\tthere') == -1
+        assert wcswidth('There is an \033') == -1
+        assert wcswidth('\001') == -1
+        assert wcswidth('\000') == 0
+        assert wcswidth('Hi \000there\000') == 8
+
+
+class TestAccents:
+    def test_latin1_chars(self):
+        assert wcswidth('m\u00fcller') == 6
+
+    def test_cafe(self):
+        assert wcswidth('cafe\u0301') == 4
+
+    def test_accute_accent(self):
+        assert wcwidth('\u0301') == 0
+        assert wcswidth('\u0301') == 0
+
+
+class TestNarrowAndWide:
+    def test_right_triangle(self):
+        assert wcwidth('\u22bf') == 1
+
+    def test_turned_not_sign(self):
+        assert wcwidth('\u2319') == 1
+
+    def test_watch_hourglass(self):
+        assert wcwidth('\u231a') == 2
+        assert wcwidth('\u231b') == 2
+
+    def test_warning_sign(self):
+        assert wcwidth('\u26a0') == 1
+
+    def test_heavy_plus_sign(self):
+        assert wcwidth('\u2795') == 2
+
+    def test_ideographic_half_fill_space(self):
+        assert wcwidth('\u303f') == 1
+
+    def test_hiragana_small_a(self):
+        assert wcwidth('\u3041') == 2
+
+    def test_segmented_digit_three(self):
+        assert wcwidth('\U0001FBF3') == 1
+
+    def test_katakana_small_ha(self):
+        assert wcwidth('\u31f5') == 2
+
+    def test_cjk_tiger(self):
+        assert wcwidth('\u4e54') == 2
+
+    def test_hello_world_jp(self):
+        assert wcswidth('コンニチハ, セカイ!') == 19
+
+    def test_hello_world_jp_offset(self):
+        assert wcswidth('コンニチハ, セカイ!', n=7) == 12
+
+
+class TestBasicEmojis:
+    def test_smiley(self):
+        assert wcwidth('\U0001F600') == 2
+
+    def test_hi_wave(self):
+        assert wcswidth('Hi\U0001F44B!') == 5
+
+    def test_shaking_face(self):
+        assert wcwidth('\U0001FAE8') == 2
+
+
+class TestFlagSequences:
+    def test_french_polynesia(self):
+        assert wcswidth('\U0001F1F5\U0001F1EB') == 2
+
+    def test_diego_garcia(self):
+        assert wcswidth('This is the \U0001F1E9\U0001F1EC island') == 21
+
+    def test_scotland(self):
+        assert wcswidth('\U0001F3F4\U000E0067\U000E0062\U000E0073'
+                        '\U000E0063\U000E0074\U000E007F') == 2
+
+
+class TestEmojiVS15VS16:
+    def test_vs16_0length(self):
+        assert wcwidth('\ufe0e') == 0
+        assert wcswidth('\ufe0e') == 0
+        assert wcwidth('\ufe0f') == 0
+        assert wcswidth('\ufe0f') == 0
+
+    def test_warn_emoji(self):
+        assert wcwidth('\u26a0') == 1
+        assert wcswidth('\u26a0') == 1
+        assert wcswidth('\u26a0\ufe0f') == 2
+        assert wcswidth('\u26a0\ufe0e') == 1
+        assert wcswidth('This is a \u26a0 warning!') == 20
+        assert wcswidth('This is a \u26a0\ufe0e warning!') == 20
+        assert wcswidth('This is a \u26a0\ufe0f warning!') == 21
+
+    def test_diamond_suit(self):
+        assert wcwidth('\u2666') == 1
+        assert wcswidth('\u2666\ufe0f') == 2
+        assert wcswidth('\u2666\ufe0e') == 1
+
+    def test_person_bouncing_ball(self):
+        assert wcswidth('\u26f9') == 1
+        assert wcswidth('\u26f9\ufe0f') == 2
+        assert wcswidth('\u26f9\ufe0e') == 1
+
+    def test_no_op_on_poodle(self):
+        assert wcswidth('\U0001F429') == 2
+        assert wcswidth('\U0001F429\ufe0f') == 2
+
+
+class TestKeycaps:
+    def test_five(self):
+        assert wcswidth('5\ufe0f\u20e3') == 2
+        assert wcswidth('5\ufe0f\u20e3 + 1 = 6') == 10
+
+
+class TestEmojiModifier:
+    def test_em_fitzpatrick_type_3(self):
+        assert wcwidth('\U0001F3FC') == 2
+        assert is_EMB('\U0001F3FC') == False
+        assert is_EMB_uint32(ord('\U0001F3FC')) == False
+        assert wcswidth('\U0001F3FC is a skin tone!') == 18
+
+    def test_does_not_modify_postbox(self):
+        assert wcwidth('\U0001F4EE') == 2
+        assert is_EMB('\U0001F4EE') == False
+        assert wcswidth('\U0001F4EE\U0001F3FE') == 4
+
+    def test_modifies_thumb_up(self):
+        assert is_EMB('\U0001F44D') == True
+        assert is_EMB_uint32(ord('\U0001F44D')) == True
+        assert wcswidth('\U0001F44D\U0001F3FF') == 2
+
+    def test_does_not_modify_smiley(self):
+        assert wcswidth('\U0001F600\U0001F3FB') == 4
+
+    def test_modifies_and_emojifies_person_bouncing_ball(self):
+        assert wcwidth('\u26f9') == 1
+        assert is_EMB('\u26f9') == True
+        assert is_EMB_uint32(ord('\u26f9')) == True
+        assert wcswidth('\u26f9\U0001F3FC') == 2
+        assert wcswidth('\u26f9\U0001F3FC', n=1) == 1
+
+    def test_does_not_modify_person_bouncing_ball_vs16(self):
+        assert wcswidth('\u26f9\ufe0f\U0001F3FD') == 4
+
+    def test_does_not_modify_person_bouncing_ball_vs15(self):
+        assert wcswidth('\u26f9\ufe0e\U0001F3FE') == 3
+
+    def test_person_bouncing_ball_explained(self):
+        assert wcswidth('\u26f9\U0001F3FF=\u26f9\ufe0e\U0001F3FF') == 6
+
+    def test_does_not_modify_tent(self):
+        assert wcwidth('\u26fa') == 2
+        assert is_EMB('\u26fa') == False
+        assert wcswidth('\u26fa\U0001F3FB') == 4
+
+    def test_does_not_also_modify_older_chars(self):
+        assert wcswidth('\u26f9\U0001F44D\U0001F3FC') == 3
+        assert wcswidth('\u26f9\U0001F44D\U0001F3FC', n=1) == 1
+        assert wcswidth('\u26f9\U0001F44D\U0001F3FC', n=2) == 3
+
+
+class TestEmojiZWJ:
+    def test_woman_with_white_cane(self):
+        assert wcswidth('\U0001F469\u200d\U0001F9AF') == 2
+
+    def test_woman_with_white_cane_facing_right(self):
+        assert wcswidth('\U0001F469\u200d\U0001F9AF\u200d\u27a1') == 2
+
+    def test_woman_with_white_cane_facing_right_fully_qualified(self):
+        assert wcswidth('\U0001F469\u200d\U0001F9AF\u200d\u27a1\ufe0f') == 2
+
+    def test_couple_with_heart_woman_man(self):
+        assert wcswidth('\U0001F469\u200d\u2764\u200d\U0001F468') == 2
+        assert wcswidth('\U0001F469\u200d\u2764\ufe0f\u200d\U0001F468') == 2
+
+    def test_man_running_medium_dark_skin_tone(self):
+        assert wcswidth('\U0001F3C3\U0001F3FE\u200d\u2642\ufe0f') == 2
+        assert wcswidth('\U0001F3C3\U0001F3FE\u200d\u2642') == 2
+
+    def test_kiss_person_person_medium_skin_tone_medium_dark_skin_tone(self):
+        assert wcswidth('\U0001F9D1\U0001F3FD\u200d\u2764\u200d'
+                        '\U0001F48B\u200d\U0001F9D1\U0001F3FE') == 2
+        assert wcswidth('\U0001F9D1\U0001F3FD\u200d\u2764\ufe0f\u200d'
+                        '\U0001F48B\u200d\U0001F9D1\U0001F3FE') == 2
+
+    def test_family_man_man_girl_girl(self):
+        assert wcswidth('\U0001F468\u200d\U0001F468\u200d'
+                        '\U0001F467\u200d\U0001F467') == 2
+
+
+class TestHard:
+    def test_man_running_explained(self):
+        assert wcswidth('\U0001F3C3\U0001F3FE\u200d\u2642'
+                        '=\U0001F3FE\U0001F3C3\u2642') == 8
+        assert wcswidth('\U0001F3C3\U0001F3FE\u200d\u2642\ufe0f'
+                        '=\U0001F3C3\ufe0f\U0001F3FE\u2642') == 8
+
+    def test_person_playing_ball_for_scotland(self):
+        assert wcswidth('\u26f9\U0001F3FC'
+                        '\U0001F3F4\U000E0067\U000E0062\U000E0073'
+                        '\U000E0063\U000E0074\U000E007F!') == 5
+        assert wcswidth('\u26f9'
+                        '\U0001F3F4\U000E0067\U000E0062\U000E0073'
+                        '\U000E0063\U000E0074\U000E007F!') == 4
diff --git a/uwcwidth/__init__.pxd b/uwcwidth/__init__.pxd
@@ -0,0 +1,4 @@
+#cython: language_level=3
+# SPDX-License-Identifier: MIT
+# Re-export the C-level API so that `from uwcwidth cimport ...` works.
+from uwcwidth.uwcwidth cimport (wcwidth, wcswidth, wcwidth_uint32,
+                                is_EMB, is_EMB_uint32)
diff --git a/uwcwidth/__init__.py b/uwcwidth/__init__.py
@@ -0,0 +1,2 @@
+from .uwcwidth import (wcwidth, wcswidth, wcwidth_uint32,
+                       is_EMB, is_EMB_uint32)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .uwcwidth import (wcwidth, wcswidth, wcwidth_uint32,
		is_EMB, is_EMB_uint32)