Skip to content

Commit

Permalink
Merge pull request #146 from fabioz/master
Browse files Browse the repository at this point in the history
Utilities to convert to/from utf-8, utf-16.
  • Loading branch information
angelozerr authored Sep 19, 2017
2 parents bcd0b1e + 9594e0a commit e0a751a
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
* - GitHub Inc.: Initial code, written in JavaScript, licensed under MIT license
* - Angelo Zerr <[email protected]> - translation and adaptation to Java
* - Fabio Zadrozny <[email protected]> - Convert uniqueId to Object (for identity compare)
* - Fabio Zadrozny <[email protected]> - Utilities to convert between utf-8 and utf-16
*/

package org.eclipse.tm4e.core.internal.oniguruma;

import java.util.Arrays;

import org.jcodings.specific.UTF8Encoding;
import java.nio.charset.StandardCharsets;

/**
Expand All @@ -28,8 +32,12 @@
public class OnigString {

/** The Java (UTF-16) string this instance wraps. */
private final String str;
/** Identity token; per the file header it is an Object so it can be compared by identity. */
private final Object uniqueId;
/** Encoded bytes of {@link #str}; decoded as UTF-8 when the offset table is computed. */
private final byte[] value;

/**
 * Lazily computed table mapping each byte position in {@link #value} to a
 * char position in {@link #str}. Stays {@code null} when both lengths are
 * equal, in which case byte and char offsets coincide.
 */
private int[] charsPosFromBytePos;
/** Whether the offset table computation has already run. */
private boolean computedOffsets;


public OnigString(String str) {
this.str = str;
Expand All @@ -49,10 +57,90 @@ public byte[] utf8_value() {
}

/**
 * Returns the length of the encoded byte form of the string.
 *
 * <p>This is the number of bytes in {@code value}, not the number of UTF-16
 * chars in the Java string; the two differ as soon as the string contains a
 * character that needs more than one byte.
 *
 * @return the number of bytes in {@code value}
 */
public int utf8_length() {
    return value.length;
}

/**
 * Gives back the string this instance was created from.
 *
 * @return the wrapped Java string, unchanged
 */
public String getString() {
    return this.str;
}

/**
 * Translates a char offset in the Java string into the matching byte offset
 * in the encoded byte array.
 *
 * <p>The offset one past the last char is accepted and maps to the total
 * number of bytes (right boundary).
 *
 * @param posInChars char offset into the string
 * @return the byte offset of the first byte belonging to that char
 * @throws ArrayIndexOutOfBoundsException if the offset is negative, lies
 *         beyond the right boundary, or the byte array is empty
 */
public int convertUtf16OffsetToUtf8(int posInChars) {
    if (!computedOffsets) {
        computeOffsets();
    }

    final int[] byteToChar = charsPosFromBytePos;
    if (byteToChar == null) {
        // No table was built: bytes and chars have the same length, so the
        // offset is returned unchanged after the same validity checks as below.
        final int byteCount = this.value.length;
        if (posInChars >= 0 && byteCount != 0 && posInChars <= byteCount) {
            return posInChars;
        }
        throw new ArrayIndexOutOfBoundsException(posInChars);
    }

    if (posInChars < 0 || byteToChar.length == 0) {
        throw new ArrayIndexOutOfBoundsException(posInChars);
    }
    if (posInChars == 0) {
        return 0;
    }

    final int lastCharPos = byteToChar[byteToChar.length - 1];
    if (posInChars > lastCharPos) {
        // Only the position immediately after the last char is tolerated.
        if (posInChars - 1 != lastCharPos) {
            throw new ArrayIndexOutOfBoundsException(posInChars);
        }
        return byteToChar.length;
    }

    // The table is non-decreasing, so a binary search lands on some byte of
    // the requested char; step back to the first byte carrying that value.
    int bytePos = Arrays.binarySearch(byteToChar, posInChars);
    while (bytePos > 0 && byteToChar[bytePos - 1] == posInChars) {
        bytePos--;
    }
    return bytePos;
}

/**
 * Translates a byte offset in the encoded byte array into the matching char
 * offset in the Java string.
 *
 * <p>Negative offsets are passed through untouched, as is any offset when no
 * table was built (byte and char offsets then coincide).
 *
 * @param posInBytes byte offset into the encoded bytes
 * @return the char offset that the byte belongs to
 * @throws ArrayIndexOutOfBoundsException if a table exists and the offset is
 *         more than one past its last byte
 */
public int convertUtf8OffsetToUtf16(int posInBytes) {
    if (!computedOffsets) {
        computeOffsets();
    }

    final int[] byteToChar = charsPosFromBytePos;
    if (byteToChar == null || posInBytes < 0) {
        return posInBytes;
    }
    if (posInBytes < byteToChar.length) {
        return byteToChar[posInBytes];
    }
    // One off can happen when finding the end of a regexp (it's the right
    // boundary); anything further out fails on the array access below.
    return byteToChar[posInBytes - 1] + 1;
}

/**
 * Builds the byte-position to char-position table, once.
 *
 * <p>The table is only created when the byte length differs from the string
 * length; otherwise it is left unset and offsets are used as they are.
 *
 * <p>NOTE(review): every encoded sequence advances the char position by one.
 * A 4-byte UTF-8 sequence (supplementary code point) takes two chars in a
 * Java String, so offsets after such a character look like they will be off
 * by one. Confirm, and fix together with both convert methods.
 */
private void computeOffsets() {
    final byte[] utf8 = this.value;
    final int total = utf8.length;

    if (total != this.str.length()) {
        final int[] table = new int[total];
        charsPosFromBytePos = table;

        int bytePos = 0;
        int charPos = 0;
        while (bytePos < total) {
            final int seqLen = UTF8Encoding.INSTANCE.length(utf8, bytePos, total);
            final int seqEnd = bytePos + seqLen;
            // Every byte of one encoded sequence maps to the same char position.
            for (int b = bytePos; b < seqEnd; b++) {
                table[b] = charPos;
            }
            bytePos = seqEnd;
            charPos++;
        }

        // Sanity check: the sequences must tile the byte array exactly.
        if (bytePos != total) {
            throw new AssertionError(bytePos + " != " + total);
        }
    }
    computedOffsets = true;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package org.eclipse.tm4e.core;

import org.eclipse.tm4e.core.internal.oniguruma.OnigString;
import org.junit.Assert;
import org.junit.Test;

/**
 * Tests for the byte-offset / char-offset conversions of {@link OnigString}.
 */
public class OnigStringTest {

    /** Two 2-byte characters: 4 bytes, 2 chars, and offset 0 maps to 0. */
    @Test
    public void testUtf8Utf16Conversions() {
        OnigString onigString = new OnigString("áé");
        // assertEquals takes (expected, actual); keeping that order makes
        // failure messages read correctly.
        Assert.assertEquals(4, onigString.utf8_length());
        Assert.assertEquals(2, onigString.getString().length());
        Assert.assertEquals(0, onigString.convertUtf8OffsetToUtf16(0));
    }

    /** Mixed 1-byte and 2-byte characters, checked in both directions. */
    @Test
    public void testUtf8Utf16Conversions2() {
        String string = "myááçóúôõaab";
        OnigString utf8WithCharLen = new OnigString(string);

        // Char offset -> byte offset: each accented char adds two bytes.
        Assert.assertEquals(0, utf8WithCharLen.convertUtf16OffsetToUtf8(0));
        Assert.assertEquals(1, utf8WithCharLen.convertUtf16OffsetToUtf8(1));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf16OffsetToUtf8(2));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf16OffsetToUtf8(3));
        Assert.assertEquals(6, utf8WithCharLen.convertUtf16OffsetToUtf8(4));
        Assert.assertEquals(8, utf8WithCharLen.convertUtf16OffsetToUtf8(5));
        Assert.assertEquals(10, utf8WithCharLen.convertUtf16OffsetToUtf8(6));
        Assert.assertEquals(12, utf8WithCharLen.convertUtf16OffsetToUtf8(7));
        try {
            utf8WithCharLen.convertUtf16OffsetToUtf8(55);
            Assert.fail("Expected error");
        } catch (ArrayIndexOutOfBoundsException expected) {
            // Expected: 55 is far beyond the end of the string.
        }

        // Byte offset -> char offset: both bytes of a 2-byte char map to it.
        Assert.assertEquals(0, utf8WithCharLen.convertUtf8OffsetToUtf16(0));
        Assert.assertEquals(1, utf8WithCharLen.convertUtf8OffsetToUtf16(1));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(2));
        Assert.assertEquals(2, utf8WithCharLen.convertUtf8OffsetToUtf16(3));
        Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(4));
        Assert.assertEquals(3, utf8WithCharLen.convertUtf8OffsetToUtf16(5));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(6));
        Assert.assertEquals(4, utf8WithCharLen.convertUtf8OffsetToUtf16(7));
        Assert.assertEquals(5, utf8WithCharLen.convertUtf8OffsetToUtf16(8));
        Assert.assertEquals(6, utf8WithCharLen.convertUtf8OffsetToUtf16(10));
        Assert.assertEquals(7, utf8WithCharLen.convertUtf8OffsetToUtf16(12));
        try {
            utf8WithCharLen.convertUtf8OffsetToUtf16(55);
            Assert.fail("Expected error");
        } catch (ArrayIndexOutOfBoundsException expected) {
            // Expected: 55 is far beyond the end of the encoded bytes.
        }
    }
}

0 comments on commit e0a751a

Please sign in to comment.