Skip to content

Commit

Permalink
Related to #27 (crfsuite). Allow to reconstruct the original text + a…
Browse files Browse the repository at this point in the history
…llow to add a from/to field in as.data.frame (useful but undocumented feature).
  • Loading branch information
jwijffels committed Aug 28, 2018
1 parent 50931b0 commit 1ddfdbc
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 4 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ LICENSE
dutch-ud-2.0-170801.udpipe
sanskrit-ud-2.0-170801.udpipe
dutch-lassysmall-ud-2.0-170801.udpipe
spanish-ud-2.0-170801.udpipe
docusaurus/
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ dev
.udpipe_process.log
dutch-ud-2.0-170801.udpipe
sanskrit-ud-2.0-170801.udpipe
dutch-lassysmall-ud-2.0-170801.udpipe
dutch-lassysmall-ud-2.0-170801.udpipe
spanish-ud-2.0-170801.udpipe
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: udpipe
Type: Package
Title: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit
Version: 0.6.1
Version: 0.6.2
Maintainer: Jan Wijffels <[email protected]>
Authors@R: c(person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = '[email protected]'),
person('BNOSAC', role = 'cph'),
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CHANGES IN udpipe VERSION 0.6.2

- Allow reconstructing the original text and adding from/to fields in as.data.frame (useful but undocumented feature)

# CHANGES IN udpipe VERSION 0.6.1

- src/udpipe.cpp: at the request of CRAN: remove dynamic execution specification which g++-7 and later complain about by removing the throw statements
Expand Down
15 changes: 13 additions & 2 deletions R/udpipe_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,20 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
output_fields <- append(output_fields, values = "term_id", after = 4)
}
}
if("from_to" %in% names(ldots)){
if(isTRUE(ldots$from_to)){
output_fields <- append(output_fields, values = c("from", "to"), after = 4)
}
}
## Default output
default <- data.frame(doc_id = character(),
paragraph_id = integer(),
sentence_id = character(),
sentence = character(),
sentence = character(),
from = integer(),
to = integer(),
term_id = integer(),
token_id = character(),
token_id = character(),
token = character(),
lemma = character(),
upos = character(),
Expand Down Expand Up @@ -240,6 +247,10 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
out[, dep_rel := underscore_as_na(dep_rel)]
out[, deps := underscore_as_na(deps)]
out[, misc := underscore_as_na(misc)]
if(all(c("from", "to") %in% output_fields)){
out[, c("from", "to") := udpipe_reconstruct(sentence_id = sentence_id, token = token, token_id = token_id, misc = misc, only_from_to = TRUE),
by = list(doc_id)]
}
out <- out[, output_fields, with = FALSE]
out <- data.table::setDF(out)
out
Expand Down
136 changes: 136 additions & 0 deletions R/udpipe_reconstruct.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
## Development scratchpad: the statements are wrapped in if(FALSE) so they are never
## evaluated when this file is sourced. Run them by hand to try udpipe_reconstruct
## on an annotated Spanish text.
if(FALSE){
library(udpipe)
## Sample text with leading and trailing spaces, line breaks, \n escapes and a literal backslash
txt <- " Maxime y su mujer\\ hicieron que nuestra estancia
fuera lo mas comoda posible. \n
El primer dia Maxime nos espero hasta tarde para recibirnos y
darnos todas las indicaciones posibles del apartamento y
de la situacion de aparcamiento en el barrio ya que fuimos
desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero
como saliamos pronto y llegabamos tarde no nos afectaba).\n
El apartamento es muy completo, la verdad es como aparece
en el anuncio, es mas, incluso tiene una barandilla
en la escaleras que dan a la habitacion que en la foto no sale.\n
El jardin esta muy bien para desayunar o cenar ya que
tiene una mesa grande para ello.\n
El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
En general estuvimos muy comodos durante nuestra estancia,
repetiriamos ahora mismo.\n Muchas gracias por todo Maxime. "
## Download the Spanish model and load it from the returned file path
ud_model <- udpipe_download_model(language = "spanish")
ud_model <- udpipe_load_model(ud_model$file_model)
## Annotate the text; from_to = TRUE requests the extra from/to columns in the data.frame
x <- udpipe_annotate(ud_model, x = txt)
x <- as.data.frame(x, from_to = TRUE)
## Rebuild the text and the token offsets from the token-level columns
original <- udpipe_reconstruct(sentence_id = x$sentence_id, token = x$token, token_id = x$token_id, misc = x$misc)
}


## Reconstruct the original text and the character offsets of each token from
## the token-level CoNLL-U columns of ONE document.
##
## Arguments (all vectors of the same length, one element per CoNLL-U row, in row order):
##   sentence_id   sentence identifier of the row
##   token         FORM of the row
##   token_id      ID of the row; multi-word tokens are ranges like "3-4"
##   misc          MISC of the row, NA if empty
##   only_from_to  logical, if TRUE the reconstructed text is not returned
##
## Value: a list with elements from and to (1-based character positions of each
## token in the reconstructed text, NA for the words which are part of a
## multi-word token) and, unless only_from_to is TRUE, element text containing
## the reconstructed text as one string.
##
## Offsets are accumulated over all rows passed and rows are identified by
## sentence_id + token_id only, so call this per document.
udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){

  ##
  ## FROM THE UDPIPE DOCS:
  ##

  # The markup uses the following MISC fields on tokens (not words in multi-word tokens):
  #   SpacesBefore=content (by default empty): spaces/other content preceding the token
  #   SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise): spaces/other content following the token
  #   SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including original spaces (this is needed only if tokens are allowed to contain spaces and a token contains a tab or newline characters)

  # The content of all the three fields must be escaped to allow storing tabs and newlines. The following C-like schema is used:
  #   \s: space
  #   \t: tab
  #   \r: CR character
  #   \n: LF character
  #   \p: | (pipe character)
  #   \\: \ (backslash character)

  token    <- as.character(token)
  token_id <- as.character(token_id)
  misc     <- as.character(misc)

  ## TRUE for the rows whose MISC contains the field key=..., fields are separated by |
  has_field <- function(x, key){
    grepl(pattern = sprintf("(^|[|])%s=", key), x)
  }
  ## Value of field key=value in MISC, stopping at the next | so that other fields do not leak in.
  ## Only call this on elements for which has_field is TRUE.
  misc_field <- function(x, key){
    sub(pattern = sprintf("^(.*[|])?%s=([^|]*).*$", key), replacement = "\\2", x)
  }
  ## Decode the escapes in ONE left-to-right pass. Chained gsub calls are order
  ## dependent: an escaped backslash followed by an s would be decoded as a space.
  unescape <- function(x){
    if(length(x) == 0L){
      return(x)
    }
    decode <- c("\\s" = " ", "\\t" = "\t", "\\r" = "\r", "\\n" = "\n", "\\p" = "|", "\\\\" = "\\")
    hits <- gregexpr(pattern = "\\\\[strnp\\\\]", x)
    regmatches(x, hits) <- lapply(regmatches(x, hits), FUN = function(escapes) unname(decode[escapes]))
    x
  }

  has_spaceafter_no <- grepl(pattern = "(^|[|])SpaceAfter=No([|]|$)", misc)
  has_spacesafter   <- has_field(misc, "SpacesAfter")
  has_spacesbefore  <- has_field(misc, "SpacesBefore")
  has_spacesintoken <- has_field(misc, "SpacesInToken")

  ##
  ## Spaces after: a single space by default (also if MISC is missing), nothing
  ## if SpaceAfter=No, the decoded content if SpacesAfter= is given
  ##
  after <- rep(" ", length(token))
  after[has_spaceafter_no] <- ""
  after[has_spacesafter] <- unescape(misc_field(misc[has_spacesafter], "SpacesAfter"))

  ##
  ## Spaces before: empty by default, the decoded content if SpacesBefore= is given
  ##
  before <- rep("", length(token))
  before[has_spacesbefore] <- unescape(misc_field(misc[has_spacesbefore], "SpacesBefore"))

  ##
  ## SpacesInToken - MISC field stores form of the token including original spaces if there is a space in the token which can not be handled by FORM
  ##
  token[has_spacesintoken] <- unescape(misc_field(misc[has_spacesintoken], "SpacesInToken"))

  ##
  ## Multi-word tokens: keep the row with the range (it carries the surface form)
  ## and drop ALL the words it spans, e.g. 1-3 spans words 1, 2 and 3
  ##
  is_multi_word <- grepl(pattern = "^[0-9]+-[0-9]+$", token_id)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- unlist(Map(f = function(sid, range){
    bounds <- as.integer(strsplit(range, split = "-", fixed = TRUE)[[1]])
    sprintf("%s.%s", sid, seq(bounds[1], bounds[2]))
  }, sentence_id[is_multi_word], token_id[is_multi_word]), use.names = FALSE)
  idx <- which(ids %in% ids_remove)

  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character.
  ## Applied on the last row which is kept, as the last row itself can be a word of a multi-word token.
  kept <- setdiff(seq_along(token), idx)
  if(length(kept) > 0L){
    last <- kept[length(kept)]
    after[last] <- sub(pattern = "\n$", replacement = "", after[last])
  }

  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)
  original_txt[idx] <- ""

  ##
  ## Construct from-to: positions of the token itself, excluding the surrounding spaces
  ##
  before[idx] <- ""
  after[idx]  <- ""

  nchars        <- nchar(original_txt)
  original_to   <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to   <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx]   <- NA_integer_

  if(only_from_to){
    return(list(from = from, to = to))
  }
  list(text = paste(original_txt, collapse = ""),
       from = from,
       to = to)
}

0 comments on commit 1ddfdbc

Please sign in to comment.