Skip to content

Commit

Permalink
Regex for unicode-32 characters. Repeating operators. IntersectionRI. (
Browse files Browse the repository at this point in the history
…#57)

* fix regex for unicode-32 characters. remove repeating operators in neural parser. unit skips in IntersectionRI

* some tests added
  • Loading branch information
EgoLaparra authored Dec 23, 2019
1 parent 2ea4c4a commit 915bacc
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ class TemporalNeuralParser(modelStream: Option[InputStream] = None) extends Auto
}

def parseBatchToXML(text: String, spans: Array[(Int, Int)]): Array[Elem] = {
val antixmlCleanedText = """[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD]""".r.replaceAllIn(text, " ")
val antixmlCleanedText = """[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD]""".r.replaceAllIn(text, " " * _.group(0).length)

val allTimeSpans = identifyBatch(text, spans)
val timeSpanToId = allTimeSpans.flatten.zipWithIndex.toMap.mapValues(i => s"$i@id")
Expand Down Expand Up @@ -249,7 +249,7 @@ class TemporalNeuralParser(modelStream: Option[InputStream] = None) extends Auto
// for each batch, combine the different time operators into a single array
for (i <- expandedTexts.indices.toArray) yield {
val operators = nonOperators(i) ++ expOperators(i) ++ impOperators(i)
operators.sortBy { case (start, _, _) => start }
operators.sortBy { case (start, _, _) => start }.distinct
}
}

Expand Down
22 changes: 14 additions & 8 deletions src/main/scala/org/clulab/timenorm/scate/Types.scala
Original file line number Diff line number Diff line change
Expand Up @@ -842,14 +842,17 @@ case class IntersectionRI(repeatingIntervals: Set[RepeatingInterval],
if (startInterval.start.plus(1, range).isBefore(ldt)) {
startPoint = startPoint.plus(1, range)
}
val iterators = sortedRepeatingIntervals.map(_.preceding(startPoint).buffered)
var iterators = sortedRepeatingIntervals.map(_.preceding(startPoint).buffered)

Iterator.continually {
startPoint = startPoint.minus(1, range)
do {
startPoint = startPoint.minus(1, range)
} while(iterators.exists(startPoint isAfter _.head.start))
val firstInterval = iterators.head.next
val othersAfterStart = iterators.tail.map(it => it.takeWhile(_ => it.head.start isAfter startPoint).toList)
val (othersAfterStart, rest) = iterators.tail.map(_.span(_.start isAfter startPoint)).unzip
iterators = iterators.head :: rest.map(_.buffered)

othersAfterStart.iterator.foldLeft(List(firstInterval)) {
othersAfterStart.map(_.toList).iterator.foldLeft(List(firstInterval)) {
(intersectedIntervals, newIntervals) => newIntervals.filter(overlapsWith(_, intersectedIntervals))
}
}.flatten
Expand All @@ -862,14 +865,17 @@ case class IntersectionRI(repeatingIntervals: Set[RepeatingInterval],
if (startInterval.end.minus(1, range).isAfter(ldt)) {
startPoint = startPoint.minus(1, range)
}
val iterators = sortedRepeatingIntervals.map(_.following(startPoint).buffered)
var iterators = sortedRepeatingIntervals.map(_.following(startPoint).buffered)

Iterator.continually {
startPoint = startPoint.plus(1, range)
do {
startPoint = startPoint.plus(1, range)
} while(iterators.exists(startPoint isBefore _.head.end))
val firstInterval = iterators.head.next
val othersBeforeStart = iterators.tail.map(it => it.takeWhile(_ => it.head.end isBefore startPoint).toList)
val (othersBeforeStart, rest) = iterators.tail.map(_.span(_.start isBefore startPoint)).unzip
iterators = iterators.head :: rest.map(_.buffered)

othersBeforeStart.iterator.foldLeft(List(firstInterval)) {
othersBeforeStart.map(_.toList).iterator.foldLeft(List(firstInterval)) {
(intersectedIntervals, newIntervals) => newIntervals.filter(overlapsWith(_, intersectedIntervals))
}
}.flatten
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,18 @@ class TemporalNeuralParserTest extends FunSuite with BeforeAndAfterAll with Type
assert(ids === ids.distinct)
}

test("no-duplicate-ids-2") {
  // Regression test: for "last" in the text below, the parser used to emit an explicit and an
  // implicit Last operator at the same time, both carrying the same ID, and that duplicate
  // sent the normalization into an infinite loop.
  // The literal is kept flush-left so that the string handed to the parser is unchanged.
  val text =
"""
|up to 2.6 percent in 2017 from last year's 1.4 percent,
|the IMF said on Monday in its latest Regional Economic Outlook
""".stripMargin
  val entityIds = (parser.parseToXML(text) \\ "id").map(_.text)
  // every id found in the generated XML must occur exactly once
  assert(entityIds === entityIds.distinct)
}

test("number-too-long"){
// 20110805000336031965 is too long to be converted to Long type.
// parseToXML should be able to produce a well formed XML with Value equal to 20110805000336031965
Expand All @@ -163,4 +175,11 @@ class TemporalNeuralParserTest extends FunSuite with BeforeAndAfterAll with Type
// parse should throw an AnaforaReader.Exception
intercept[AnaforaReader.Exception] { parser.parse("20110805000336031965") }
}

test("4-bytes-unicode-character") {
  // The character between the parentheses is written as a surrogate pair, i.e. it occupies two
  // chars in the string. The parser must replace it with 2 space characters (one per char);
  // otherwise the spans it reports for later time expressions are incorrect.
  val text = "the mean value (\uD835\uDF07) for 2013"
  // exactly one time expression is expected, and it must be an Interval
  val Array(year2013: Interval) = parser.parse(text)
  assert(year2013.charSpan === Some((24,28)))
}
}
28 changes: 28 additions & 0 deletions src/test/scala/org/clulab/timenorm/scate/TypesTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,34 @@ class TypesTest extends FunSuite with TypesSuite {
))
fail("April 31st should throw an exception")
}


// In the following cases IntersectionRI must skip one range unit.
//Interval: March 1, 2012
val mar312012 = SimpleInterval.of(2012, 3, 1)
//RepeatingInterval: The evening of the 31st
val intersectRIWithSkips = IntersectionRI(
Set(
RepeatingField(ChronoField.DAY_OF_MONTH, 31),
RepeatingField(org.clulab.time.EVENING_OF_DAY, 1)
))

val followingWithSkips = intersectRIWithSkips.following(mar312012.start)
//Expected: March 31 2012 @ 1700, April 1 2012 @ 0000
next = followingWithSkips.next
assert(next.start === LocalDateTime.of(2012, 3, 31, 17, 0))
assert(next.end === LocalDateTime.of(2012, 4, 1, 0, 0))

//Expected: May 31 2012 @ 1700, June 1 2012 @ 0000
next = followingWithSkips.next
assert(next.start === LocalDateTime.of(2012, 5, 31, 17, 0))
assert(next.end === LocalDateTime.of(2012, 6, 1, 0, 0))

//Expected: January 31 2012 @ 1700, February 1 2012 @ 0000
val precedingWithSkips = intersectRIWithSkips.preceding(mar312012.start)
next = precedingWithSkips.next
assert(next.start === LocalDateTime.of(2012, 1, 31, 17, 0))
assert(next.end === LocalDateTime.of(2012, 2, 1, 0, 0))
}

test("IntersectionI") {
Expand Down

0 comments on commit 915bacc

Please sign in to comment.