Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utilities to convert to/from utf-8, utf-16. #146

Merged
merged 2 commits into from
Sep 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
* - GitHub Inc.: Initial code, written in JavaScript, licensed under MIT license
* - Angelo Zerr <[email protected]> - translation and adaptation to Java
* - Fabio Zadrozny <[email protected]> - Convert uniqueId to Object (for identity compare)
* - Fabio Zadrozny <[email protected]> - Utilities to convert between utf-8 and utf-16
*/

package org.eclipse.tm4e.core.internal.oniguruma;

import java.util.Arrays;

import org.jcodings.specific.UTF8Encoding;
import java.nio.charset.StandardCharsets;

/**
Expand All @@ -28,8 +32,12 @@
public class OnigString {

private final String str;
private byte[] value;
private Object uniqueId;
private final Object uniqueId;
private final byte[] value;

private int[] charsPosFromBytePos;
private boolean computedOffsets;


public OnigString(String str) {
this.str = str;
Expand All @@ -49,10 +57,90 @@ public byte[] utf8_value() {
}

public int utf8_length() {
return str.length();
return value.length;
}

/**
 * Returns the {@code str} field, i.e. the string that was passed to the
 * constructor of this instance.
 *
 * @return the string held by this instance
 */
public String getString() {
return str;
}

/**
 * Converts an offset into {@link #getString()} (counted in UTF-16 code
 * units, i.e. Java {@code char}s) into the matching offset into the byte
 * array {@code value}.
 *
 * <p>
 * The offset equal to the number of chars (the right boundary of the
 * string) maps to the number of bytes. An offset that points at the second
 * half of a surrogate pair has no byte position of its own and is mapped to
 * the first byte of the following code point.
 *
 * @param posInChars
 *            offset in UTF-16 code units
 * @return the corresponding offset in bytes
 * @throws ArrayIndexOutOfBoundsException
 *             if {@code posInChars} is negative or past the end of the
 *             string (or, when bytes and chars have the same length, if the
 *             string is empty)
 */
public int convertUtf16OffsetToUtf8(int posInChars) {
    if (!computedOffsets) {
        computeOffsets();
    }
    if (charsPosFromBytePos == null) {
        // No table was built: bytes and chars have the same length, so the
        // two kinds of offsets are identical.
        if (posInChars < 0 || this.value.length == 0 || posInChars > this.value.length) {
            throw new ArrayIndexOutOfBoundsException(posInChars);
        }
        return posInChars;
    }

    int[] charPosByBytePos = charsPosFromBytePos;
    // The last entry is the sentinel written by computeOffsets(): the total
    // number of UTF-16 code units.
    int totalChars = charPosByBytePos[charPosByBytePos.length - 1];
    if (posInChars < 0 || posInChars > totalChars) {
        throw new ArrayIndexOutOfBoundsException(posInChars);
    }

    int index = Arrays.binarySearch(charPosByBytePos, posInChars);
    if (index < 0) {
        // Not present in the table: posInChars is the low surrogate of a
        // pair. The insertion point is the first entry greater than
        // posInChars, i.e. the first byte of the next code point (or the
        // sentinel index, which equals the byte length).
        return -index - 1;
    }
    // Every byte of a multi-byte sequence carries the same char position and
    // binarySearch may land on any of them: walk back to the first one.
    while (index > 0 && charPosByBytePos[index - 1] == posInChars) {
        index--;
    }
    return index;
}

/**
 * Converts an offset into the byte array {@code value} into the matching
 * offset into {@link #getString()} (counted in UTF-16 code units).
 *
 * <p>
 * A byte offset inside a multi-byte sequence maps to the char position at
 * which that sequence starts. The offset equal to the number of bytes (the
 * right boundary, e.g. the end of a regexp match) maps to the number of
 * chars.
 *
 * @param posInBytes
 *            offset in bytes; negative values are returned unchanged
 * @return the corresponding offset in UTF-16 code units
 * @throws ArrayIndexOutOfBoundsException
 *             if a table was built and {@code posInBytes} is greater than
 *             the number of bytes
 */
public int convertUtf8OffsetToUtf16(int posInBytes) {
    if (!computedOffsets) {
        computeOffsets();
    }
    if (charsPosFromBytePos == null) {
        // Bytes and chars have the same length: offsets are identical.
        return posInBytes;
    }
    if (posInBytes < 0) {
        return posInBytes;
    }
    if (posInBytes >= charsPosFromBytePos.length) {
        // The table has one entry per byte plus the end sentinel, so
        // anything beyond it is past the right boundary.
        throw new ArrayIndexOutOfBoundsException(posInBytes);
    }
    return charsPosFromBytePos[posInBytes];
}

/**
 * Builds {@code charsPosFromBytePos} when the byte length of {@code value}
 * differs from the char length of {@code str}; otherwise leaves it
 * {@code null} (offsets are then identical).
 *
 * <p>
 * Table layout: entry {@code i} (for {@code i < value.length}) is the
 * UTF-16 position at which the code point containing byte {@code i} starts;
 * the extra last entry is the total number of UTF-16 code units.
 */
private void computeOffsets() {
    if (this.value.length != this.str.length()) {
        int length = this.value.length;
        int[] offsets = new int[length + 1];
        int bytesLen = 0;
        int charsLen = 0;
        while (bytesLen < length) {
            int codeLen = UTF8Encoding.INSTANCE.length(this.value, bytesLen, length);
            Arrays.fill(offsets, bytesLen, bytesLen + codeLen, charsLen);
            bytesLen += codeLen;
            // A 4-byte UTF-8 sequence encodes a supplementary code point,
            // which takes two UTF-16 code units (a surrogate pair).
            charsLen += (codeLen == 4) ? 2 : 1;
        }
        if (bytesLen != length) {
            throw new AssertionError(bytesLen + " != " + this.value.length);
        }
        offsets[length] = charsLen;
        // Publish only the fully built table.
        charsPosFromBytePos = offsets;
    }
    computedOffsets = true;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package org.eclipse.tm4e.core;

import org.eclipse.tm4e.core.internal.oniguruma.OnigString;
import org.junit.Assert;
import org.junit.Test;

/**
 * Tests for the UTF-8 / UTF-16 offset conversions offered by
 * {@link OnigString}.
 */
public class OnigStringTest {

    /**
     * A string of two accented letters: checks the byte length, the char
     * length and the conversion of byte offset 0.
     */
    @Test
    public void testUtf8Utf16Conversions() {
        OnigString onigString = new OnigString("áé");
        // assertEquals takes (expected, actual); keeping that order makes
        // failure messages report the right values.
        Assert.assertEquals(4, onigString.utf8_length());
        Assert.assertEquals(2, onigString.getString().length());
        Assert.assertEquals(0, onigString.convertUtf8OffsetToUtf16(0));
    }

    /**
     * A string mixing single-byte and two-byte characters: checks both
     * conversion directions and that an offset far past the end is rejected.
     */
    @Test
    public void testUtf8Utf16Conversions2() {
        String string = "myááçóúôõaab";
        OnigString utf8WithCharLen = new OnigString(string);

        Assert.assertEquals(0, utf8WithCharLen.convertUtf16OffsetToUtf8(0));
        Assert.assertEquals(1, utf8WithCharLen.convertUtf16OffsetToUtf8(1));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf16OffsetToUtf8(2));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf16OffsetToUtf8(3));
        Assert.assertEquals(6, utf8WithCharLen.convertUtf16OffsetToUtf8(4));
        Assert.assertEquals(8, utf8WithCharLen.convertUtf16OffsetToUtf8(5));
        Assert.assertEquals(10, utf8WithCharLen.convertUtf16OffsetToUtf8(6));
        Assert.assertEquals(12, utf8WithCharLen.convertUtf16OffsetToUtf8(7));
        try {
            utf8WithCharLen.convertUtf16OffsetToUtf8(55);
            Assert.fail("Expected error");
        } catch (ArrayIndexOutOfBoundsException expected) {
            // Expected: offset 55 is past the end of the string.
        }

        Assert.assertEquals(0, utf8WithCharLen.convertUtf8OffsetToUtf16(0));
        Assert.assertEquals(1, utf8WithCharLen.convertUtf8OffsetToUtf16(1));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(2));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(3));
        Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(4));
        Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(5));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(6));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(7));
        Assert.assertEquals(5, utf8WithCharLen.convertUtf8OffsetToUtf16(8));
        Assert.assertEquals(6, utf8WithCharLen.convertUtf8OffsetToUtf16(10));
        Assert.assertEquals(7, utf8WithCharLen.convertUtf8OffsetToUtf16(12));
        try {
            utf8WithCharLen.convertUtf8OffsetToUtf16(55);
            Assert.fail("Expected error");
        } catch (ArrayIndexOutOfBoundsException expected) {
            // Expected: offset 55 is past the end of the byte array.
        }
    }
}