From 7f5d912be61215370ea613f4439cb8ba951862d4 Mon Sep 17 00:00:00 2001
From: Fabio Zadrozny
Date: Sat, 16 Sep 2017 10:35:21 -0300
Subject: [PATCH] Utilities to convert to/from utf-8, utf-16.

Signed-off-by: Fabio Zadrozny
---
Review notes (text between "---" and the diffstat is not part of the commit):
 - computeOffsets() counts a 4-byte UTF-8 sequence as two UTF-16 chars
   (surrogate pair); before, every offset after such a character was off.
 - convertUtf16OffsetToUtf8() validates against the string length, accepts
   offset 0 of an empty string and maps an offset in the middle of a
   surrogate pair to the next character.
 - convertUtf8OffsetToUtf16() returns the string length for the right boundary.
 - Tests: expected/actual order fixed, surrogate pair and empty string cases
   added.
 - The blob ids on the "index" lines were not regenerated.

 .../core/internal/oniguruma/OnigString.java   | 113 ++++++++++++++++++-
 .../org/eclipse/tm4e/core/OnigStringTest.java |  84 ++++++++++++++
 2 files changed, 194 insertions(+), 3 deletions(-)
 create mode 100644 org.eclipse.tm4e.core/src/test/java/org/eclipse/tm4e/core/OnigStringTest.java

diff --git a/org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma/OnigString.java b/org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma/OnigString.java
index 448566144..53c3ea18f 100644
--- a/org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma/OnigString.java
+++ b/org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma/OnigString.java
@@ -13,11 +13,15 @@
  * - GitHub Inc.: Initial code, written in JavaScript, licensed under MIT license
  * - Angelo Zerr - translation and adaptation to Java
  * - Fabio Zadrozny - Convert uniqueId to Object (for identity compare)
+ * - Fabio Zadrozny - Utilities to convert between utf-8 and utf-16
  */
 
 package org.eclipse.tm4e.core.internal.oniguruma;
 
 import java.nio.charset.Charset;
+import java.util.Arrays;
+
+import org.jcodings.specific.UTF8Encoding;
 
 /**
  * Oniguruma string.
@@ -30,8 +34,12 @@ public class OnigString {
 	private static final String UTF_8 = "UTF-8";
 
 	private final String str;
-	private byte[] value;
-	private Object uniqueId;
+	private final Object uniqueId;
+	private final byte[] value;
+
+	private int[] charsPosFromBytePos;
+	private boolean computedOffsets;
+
 
 	public OnigString(String str) {
 		this.str = str;
@@ -51,10 +59,109 @@ public byte[] utf8_value() {
 	}
 
 	public int utf8_length() {
-		return str.length();
+		return value.length;
 	}
 
 	public String getString() {
 		return str;
 	}
+
+	/**
+	 * Converts an offset expressed in UTF-16 chars (an index into the string
+	 * returned by {@link #getString()}) to the matching offset in the UTF-8
+	 * bytes.
+	 *
+	 * @param posInChars
+	 *            offset in chars, from 0 to the string length (inclusive, so
+	 *            that the right boundary can be converted too).
+	 * @return the offset of the first byte of the character at that position;
+	 *         an offset in the middle of a surrogate pair maps to the first
+	 *         byte of the next character.
+	 * @throws ArrayIndexOutOfBoundsException
+	 *             if posInChars is negative or greater than the string length.
+	 */
+	public int convertUtf16OffsetToUtf8(int posInChars) {
+		if (posInChars < 0 || posInChars > str.length()) {
+			throw new ArrayIndexOutOfBoundsException(posInChars);
+		}
+		if (!computedOffsets) {
+			computeOffsets();
+		}
+		if (charsPosFromBytePos == null) {
+			// Only single-byte characters: byte and char offsets are the same.
+			return posInChars;
+		}
+		if (posInChars == str.length()) {
+			// Right boundary: one past the last byte.
+			return value.length;
+		}
+
+		int index = Arrays.binarySearch(charsPosFromBytePos, posInChars);
+		if (index < 0) {
+			// Not the start of a character (second half of a surrogate pair):
+			// the insertion point is the first byte of the next character.
+			return -index - 1;
+		}
+		// All the bytes of a character hold the same value: go back to the
+		// first one.
+		while (index > 0 && charsPosFromBytePos[index - 1] == posInChars) {
+			index--;
+		}
+		return index;
+	}
+
+	/**
+	 * Converts an offset in the UTF-8 bytes to the matching offset in UTF-16
+	 * chars (an index into the string returned by {@link #getString()}).
+	 *
+	 * @param posInBytes
+	 *            offset in bytes; negative values are returned unchanged.
+	 * @return the offset of the char which contains that byte, or the string
+	 *         length when posInBytes is the number of bytes (right boundary).
+	 * @throws ArrayIndexOutOfBoundsException
+	 *             if the string has multi-byte characters and posInBytes is
+	 *             greater than the number of bytes.
+	 */
+	public int convertUtf8OffsetToUtf16(int posInBytes) {
+		if (!computedOffsets) {
+			computeOffsets();
+		}
+		if (charsPosFromBytePos == null || posInBytes < 0) {
+			return posInBytes;
+		}
+		if (posInBytes == charsPosFromBytePos.length) {
+			// One off can happen when finding the end of a regexp (it's the
+			// right boundary).
+			return str.length();
+		}
+		if (posInBytes > charsPosFromBytePos.length) {
+			throw new ArrayIndexOutOfBoundsException(posInBytes);
+		}
+		return charsPosFromBytePos[posInBytes];
+	}
+
+	/**
+	 * Computes, for each UTF-8 byte, the index of the UTF-16 char it belongs to.
+	 * Nothing is stored when every character takes a single byte.
+	 */
+	private void computeOffsets() {
+		if (this.value.length != this.str.length()) {
+			charsPosFromBytePos = new int[this.value.length];
+			int bytesLen = 0;
+			int charsLen = 0;
+			int length = this.value.length;
+			while (bytesLen < length) {
+				int codeLen = UTF8Encoding.INSTANCE.length(this.value, bytesLen, length);
+				Arrays.fill(charsPosFromBytePos, bytesLen, bytesLen + codeLen, charsLen);
+				bytesLen += codeLen;
+				// A 4-byte sequence is a supplementary code point, which takes
+				// two chars (a surrogate pair) in UTF-16.
+				charsLen += codeLen == 4 ? 2 : 1;
+			}
+			if (bytesLen != this.value.length) {
+				throw new AssertionError(bytesLen + " != " + this.value.length);
+			}
+		}
+		computedOffsets = true;
+	}
 }
diff --git a/org.eclipse.tm4e.core/src/test/java/org/eclipse/tm4e/core/OnigStringTest.java b/org.eclipse.tm4e.core/src/test/java/org/eclipse/tm4e/core/OnigStringTest.java
new file mode 100644
index 000000000..4973e8478
--- /dev/null
+++ b/org.eclipse.tm4e.core/src/test/java/org/eclipse/tm4e/core/OnigStringTest.java
@@ -0,0 +1,84 @@
+package org.eclipse.tm4e.core;
+
+import org.eclipse.tm4e.core.internal.oniguruma.OnigString;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class OnigStringTest {
+
+	@Test
+	public void testUtf8Utf16Conversions() {
+		OnigString onigString = new OnigString("áé");
+		Assert.assertEquals(4, onigString.utf8_length());
+		Assert.assertEquals(2, onigString.getString().length());
+		Assert.assertEquals(0, onigString.convertUtf8OffsetToUtf16(0));
+	}
+
+	@Test
+	public void testUtf8Utf16Conversions2() {
+
+		String string = "myááçóúôõaab";
+		OnigString utf8WithCharLen = new OnigString(string);
+
+		Assert.assertEquals(0, utf8WithCharLen.convertUtf16OffsetToUtf8(0));
+		Assert.assertEquals(1, utf8WithCharLen.convertUtf16OffsetToUtf8(1));
+		Assert.assertEquals(2, utf8WithCharLen.convertUtf16OffsetToUtf8(2));
+		Assert.assertEquals(4, utf8WithCharLen.convertUtf16OffsetToUtf8(3));
+		Assert.assertEquals(6, utf8WithCharLen.convertUtf16OffsetToUtf8(4));
+		Assert.assertEquals(8, utf8WithCharLen.convertUtf16OffsetToUtf8(5));
+		Assert.assertEquals(10, utf8WithCharLen.convertUtf16OffsetToUtf8(6));
+		Assert.assertEquals(12, utf8WithCharLen.convertUtf16OffsetToUtf8(7));
+		try {
+			utf8WithCharLen.convertUtf16OffsetToUtf8(55);
+			Assert.fail("Expected error");
+		} catch (ArrayIndexOutOfBoundsException expected) {
+			// Expected: the offset is past the end of the string.
+		}
+
+		Assert.assertEquals(0, utf8WithCharLen.convertUtf8OffsetToUtf16(0));
+		Assert.assertEquals(1, utf8WithCharLen.convertUtf8OffsetToUtf16(1));
+		Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(2));
+		Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(3));
+		Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(4));
+		Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(5));
+		Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(6));
+		Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(7));
+		Assert.assertEquals(5, utf8WithCharLen.convertUtf8OffsetToUtf16(8));
+		Assert.assertEquals(6, utf8WithCharLen.convertUtf8OffsetToUtf16(10));
+		Assert.assertEquals(7, utf8WithCharLen.convertUtf8OffsetToUtf16(12));
+		try {
+			utf8WithCharLen.convertUtf8OffsetToUtf16(55);
+			Assert.fail("Expected error");
+		} catch (ArrayIndexOutOfBoundsException expected) {
+			// Expected: the offset is past the end of the bytes.
+		}
+
+	}
+
+	@Test
+	public void testUtf8Utf16ConversionsWithSurrogatePair() {
+		// U+1F600 takes 4 bytes in UTF-8 and 2 chars (a surrogate pair) in UTF-16.
+		OnigString onigString = new OnigString("a\uD83D\uDE00b");
+		Assert.assertEquals(6, onigString.utf8_length());
+		Assert.assertEquals(4, onigString.getString().length());
+
+		Assert.assertEquals(0, onigString.convertUtf8OffsetToUtf16(0));
+		Assert.assertEquals(1, onigString.convertUtf8OffsetToUtf16(1));
+		Assert.assertEquals(1, onigString.convertUtf8OffsetToUtf16(4));
+		Assert.assertEquals(3, onigString.convertUtf8OffsetToUtf16(5));
+		Assert.assertEquals(4, onigString.convertUtf8OffsetToUtf16(6));
+
+		Assert.assertEquals(0, onigString.convertUtf16OffsetToUtf8(0));
+		Assert.assertEquals(1, onigString.convertUtf16OffsetToUtf8(1));
+		Assert.assertEquals(5, onigString.convertUtf16OffsetToUtf8(2));
+		Assert.assertEquals(5, onigString.convertUtf16OffsetToUtf8(3));
+		Assert.assertEquals(6, onigString.convertUtf16OffsetToUtf8(4));
+	}
+
+	@Test
+	public void testEmptyString() {
+		OnigString onigString = new OnigString("");
+		Assert.assertEquals(0, onigString.convertUtf16OffsetToUtf8(0));
+		Assert.assertEquals(0, onigString.convertUtf8OffsetToUtf16(0));
+	}
+}