-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #146 from fabioz/master
Utilities to convert to/from utf-8, utf-16.
- Loading branch information
Showing
2 changed files
with
146 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,10 +13,14 @@ | |
* - GitHub Inc.: Initial code, written in JavaScript, licensed under MIT license | ||
* - Angelo Zerr <[email protected]> - translation and adaptation to Java | ||
* - Fabio Zadrozny <[email protected]> - Convert uniqueId to Object (for identity compare) | ||
* - Fabio Zadrozny <[email protected]> - Utilities to convert between utf-8 and utf-16 | ||
*/ | ||
|
||
package org.eclipse.tm4e.core.internal.oniguruma; | ||
|
||
import java.util.Arrays; | ||
|
||
import org.jcodings.specific.UTF8Encoding; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
/** | ||
|
@@ -28,8 +32,12 @@ | |
public class OnigString { | ||
|
||
private final String str; | ||
private byte[] value; | ||
private Object uniqueId; | ||
private final Object uniqueId; | ||
private final byte[] value; | ||
|
||
private int[] charsPosFromBytePos; | ||
private boolean computedOffsets; | ||
|
||
|
||
public OnigString(String str) { | ||
this.str = str; | ||
|
@@ -49,10 +57,90 @@ public byte[] utf8_value() { | |
} | ||
|
||
public int utf8_length() { | ||
return str.length(); | ||
return value.length; | ||
} | ||
|
||
public String getString() { | ||
return str; | ||
} | ||
|
||
public int convertUtf16OffsetToUtf8(int posInChars) { | ||
if(!computedOffsets) { | ||
computeOffsets(); | ||
} | ||
if (charsPosFromBytePos == null) { | ||
// Same conditions as code below, but taking into account that the | ||
// bytes and chars len are the same. | ||
if (posInChars < 0 || this.value.length == 0 || posInChars > this.value.length) { | ||
throw new ArrayIndexOutOfBoundsException(posInChars); | ||
} | ||
return posInChars; | ||
} | ||
|
||
int[] charsLenInBytes = charsPosFromBytePos; | ||
if (posInChars < 0 || charsLenInBytes.length == 0) { | ||
throw new ArrayIndexOutOfBoundsException(posInChars); | ||
} | ||
if (posInChars == 0) { | ||
return 0; | ||
} | ||
|
||
int last = charsLenInBytes[charsLenInBytes.length - 1]; | ||
if (last < posInChars) { | ||
if (last == posInChars - 1) { | ||
return charsLenInBytes.length; | ||
} else { | ||
throw new ArrayIndexOutOfBoundsException(posInChars); | ||
} | ||
} | ||
|
||
int index = Arrays.binarySearch(charsLenInBytes, posInChars); | ||
while (index > 0) { | ||
if (charsLenInBytes[index - 1] == posInChars) { | ||
index--; | ||
} else { | ||
break; | ||
} | ||
} | ||
return index; | ||
} | ||
|
||
public int convertUtf8OffsetToUtf16(int posInBytes) { | ||
if(!computedOffsets) { | ||
computeOffsets(); | ||
} | ||
if (charsPosFromBytePos == null) { | ||
return posInBytes; | ||
} | ||
if (posInBytes < 0) { | ||
return posInBytes; | ||
} | ||
if (posInBytes >= charsPosFromBytePos.length) { | ||
//One off can happen when finding the end of a regexp (it's the right boundary). | ||
return charsPosFromBytePos[posInBytes - 1] + 1; | ||
} | ||
return charsPosFromBytePos[posInBytes]; | ||
} | ||
|
||
private void computeOffsets() { | ||
if (this.value.length != this.str.length()) { | ||
charsPosFromBytePos = new int[this.value.length]; | ||
int bytesLen = 0;; | ||
int charsLen = 0; | ||
int length = this.value.length; | ||
for (int i = 0; i < length;) { | ||
int codeLen = UTF8Encoding.INSTANCE.length(this.value, i, length); | ||
for (int i1 = 0; i1 < codeLen; i1++) { | ||
charsPosFromBytePos[bytesLen + i1] = charsLen; | ||
} | ||
bytesLen += codeLen; | ||
i += codeLen; | ||
charsLen += 1; | ||
} | ||
if(bytesLen != this.value.length) { | ||
throw new AssertionError(bytesLen + " != "+this.value.length); | ||
} | ||
} | ||
computedOffsets = true; | ||
} | ||
} |
55 changes: 55 additions & 0 deletions
55
org.eclipse.tm4e.core/src/test/java/org/eclipse/tm4e/core/OnigStringTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package org.eclipse.tm4e.core; | ||
|
||
import org.eclipse.tm4e.core.internal.oniguruma.OnigString; | ||
import org.junit.Assert; | ||
import org.junit.Test; | ||
|
||
public class OnigStringTest { | ||
|
||
@Test | ||
public void testUtf8Utf16Conversions() { | ||
OnigString onigString = new OnigString("áé"); | ||
Assert.assertEquals(onigString.utf8_length(), 4); | ||
Assert.assertEquals(onigString.getString().length(), 2); | ||
Assert.assertEquals(onigString.convertUtf8OffsetToUtf16(0), 0); | ||
} | ||
|
||
@Test | ||
public void testUtf8Utf16Conversions2() { | ||
|
||
String string = "myááçóúôõaab"; | ||
OnigString utf8WithCharLen = new OnigString(string); | ||
|
||
Assert.assertEquals(0, utf8WithCharLen.convertUtf16OffsetToUtf8(0)); | ||
Assert.assertEquals(1, utf8WithCharLen.convertUtf16OffsetToUtf8(1)); | ||
Assert.assertEquals(2, utf8WithCharLen.convertUtf16OffsetToUtf8(2)); | ||
Assert.assertEquals(4, utf8WithCharLen.convertUtf16OffsetToUtf8(3)); | ||
Assert.assertEquals(6, utf8WithCharLen.convertUtf16OffsetToUtf8(4)); | ||
Assert.assertEquals(8, utf8WithCharLen.convertUtf16OffsetToUtf8(5)); | ||
Assert.assertEquals(10, utf8WithCharLen.convertUtf16OffsetToUtf8(6)); | ||
Assert.assertEquals(12, utf8WithCharLen.convertUtf16OffsetToUtf8(7)); | ||
try { | ||
utf8WithCharLen.convertUtf16OffsetToUtf8(55); | ||
Assert.fail("Expected error"); | ||
} catch (ArrayIndexOutOfBoundsException e) { | ||
} | ||
|
||
Assert.assertEquals(0, utf8WithCharLen.convertUtf8OffsetToUtf16(0)); | ||
Assert.assertEquals(1, utf8WithCharLen.convertUtf8OffsetToUtf16(1)); | ||
Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(2)); | ||
Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(3)); | ||
Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(4)); | ||
Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(5)); | ||
Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(6)); | ||
Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(7)); | ||
Assert.assertEquals(5, utf8WithCharLen.convertUtf8OffsetToUtf16(8)); | ||
Assert.assertEquals(6, utf8WithCharLen.convertUtf8OffsetToUtf16(10)); | ||
Assert.assertEquals(7, utf8WithCharLen.convertUtf8OffsetToUtf16(12)); | ||
try { | ||
utf8WithCharLen.convertUtf8OffsetToUtf16(55); | ||
Assert.fail("Expected error"); | ||
} catch (ArrayIndexOutOfBoundsException e) { | ||
} | ||
|
||
} | ||
} |