Skip to content

Commit

Permalink
Add missing crawl_date column to binary information jobs. (#513)
Browse files (browse the repository at this point in the history)
- Update tests
- Resolves #512
  • Loading branch information
ruebot authored Apr 29, 2021
1 parent 11ebeb7 commit 2b1deb8
Show file tree
Hide file tree
Showing 14 changed files with 60 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object AudioInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object ImageInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object PDFInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object PresentationProgramInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object SpreadsheetInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object VideoInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ object WordProcessorInformationExtractor {
  import spark.implicits._
  // scalastyle:on
  d.select(
+ $"crawl_date",
  $"url",
  $"filename",
  $"extension",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,14 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter {
  val RESULTSLENGTH = 1

  assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/files/feniz.mp3")
- assert(dfResults(0).get(1) == "feniz.mp3")
- assert(dfResults(0).get(2) == "mp3")
- assert(dfResults(0).get(3) == "audio/mpeg")
+ assert(dfResults(0).get(0) == "20190817")
+ assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3")
+ assert(dfResults(0).get(2) == "feniz.mp3")
+ assert(dfResults(0).get(3) == "mp3")
  assert(dfResults(0).get(4) == "audio/mpeg")
- assert(dfResults(0).get(5) == "f7e7ec84b12c294e19af1ba41732c733")
- assert(dfResults(0).get(6) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a")
+ assert(dfResults(0).get(5) == "audio/mpeg")
+ assert(dfResults(0).get(6) == "f7e7ec84b12c294e19af1ba41732c733")
+ assert(dfResults(0).get(7) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,16 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter {
  val RESULTSLENGTH = 55

  assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "http://www.archive.org/images/logoc.jpg")
- assert(dfResults(0).get(1) == "logoc.jpg")
- assert(dfResults(0).get(2) == "jpg")
- assert(dfResults(0).get(3) == "image/jpeg")
+ assert(dfResults(0).get(0) == "20080430")
+ assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg")
+ assert(dfResults(0).get(2) == "logoc.jpg")
+ assert(dfResults(0).get(3) == "jpg")
  assert(dfResults(0).get(4) == "image/jpeg")
- assert(dfResults(0).get(5) == 70)
- assert(dfResults(0).get(6) == 56)
- assert(dfResults(0).get(7) == "8211d1fbb9b03d8522a1ae378f9d1b24")
- assert(dfResults(0).get(8) == "a671e68fc211ee4996a91e99297f246b2c5faa1a")
+ assert(dfResults(0).get(5) == "image/jpeg")
+ assert(dfResults(0).get(6) == 70)
+ assert(dfResults(0).get(7) == 56)
+ assert(dfResults(0).get(8) == "8211d1fbb9b03d8522a1ae378f9d1b24")
+ assert(dfResults(0).get(9) == "a671e68fc211ee4996a91e99297f246b2c5faa1a")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,18 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
  val RESULTSLENGTH = 2

  assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190812")
  assert(
  dfResults(0).get(
- 0
+ 1
  ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y"
  )
- assert(dfResults(0).get(1) == "cost-analysis.pdf")
- assert(dfResults(0).get(2) == "pdf")
- assert(dfResults(0).get(3) == "application/pdf")
+ assert(dfResults(0).get(2) == "cost-analysis.pdf")
+ assert(dfResults(0).get(3) == "pdf")
  assert(dfResults(0).get(4) == "application/pdf")
- assert(dfResults(0).get(5) == "aaba59d2287afd40c996488a39bbc0dd")
- assert(dfResults(0).get(6) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
+ assert(dfResults(0).get(5) == "application/pdf")
+ assert(dfResults(0).get(6) == "aaba59d2287afd40c996488a39bbc0dd")
+ assert(dfResults(0).get(7) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,26 @@ class PresentationProgramInformationExtractorTest
  val RESULTSLENGTH = 2

  assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
  assert(
  dfResults(0).get(
- 0
+ 1
  ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx"
  )
- assert(dfResults(0).get(1) == "aut-test-fixtures.pptx")
- assert(dfResults(0).get(2) == "pptx")
+ assert(dfResults(0).get(2) == "aut-test-fixtures.pptx")
+ assert(dfResults(0).get(3) == "pptx")
  assert(
  dfResults(0).get(
- 3
+ 4
  ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
  )
  assert(
  dfResults(0).get(
- 4
+ 5
  ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
  )
- assert(dfResults(0).get(5) == "7a7b1fe4b6d311376eaced9de3b682ee")
- assert(dfResults(0).get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
+ assert(dfResults(0).get(6) == "7a7b1fe4b6d311376eaced9de3b682ee")
+ assert(dfResults(0).get(7) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,22 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
  val RESULTSLENGTH = 4

  assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
  assert(
  dfResults(0).get(
- 0
+ 1
  ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods"
  )
- assert(dfResults(0).get(1) == "test-aut-fixture.ods")
- assert(dfResults(0).get(2) == "ods")
+ assert(dfResults(0).get(2) == "test-aut-fixture.ods")
+ assert(dfResults(0).get(3) == "ods")
  assert(
- dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet"
+ dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet"
  )
  assert(
- dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet"
+ dfResults(0).get(5) == "application/vnd.oasis.opendocument.spreadsheet"
  )
- assert(dfResults(0).get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
- assert(dfResults(0).get(6) == "448c357e78317877a98a399448031a89f1dda6fb")
+ assert(dfResults(0).get(6) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
+ assert(dfResults(0).get(7) == "448c357e78317877a98a399448031a89f1dda6fb")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,16 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
  val RESULTSLENGTH = 1

  assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190817")
  assert(
- dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
+ dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
  )
- assert(dfResults(0).get(1) == "2018-11-12%2016.14.11.mp4")
- assert(dfResults(0).get(2) == "mp4")
- assert(dfResults(0).get(3) == "video/mp4")
+ assert(dfResults(0).get(2) == "2018-11-12%2016.14.11.mp4")
+ assert(dfResults(0).get(3) == "mp4")
  assert(dfResults(0).get(4) == "video/mp4")
- assert(dfResults(0).get(5) == "2cde7de3213a87269957033f6315fce2")
- assert(dfResults(0).get(6) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
+ assert(dfResults(0).get(5) == "video/mp4")
+ assert(dfResults(0).get(6) == "2cde7de3213a87269957033f6315fce2")
+ assert(dfResults(0).get(7) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
  }

  after {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,18 @@ class WordProcessorInformationExtractorTest
  val RESULTSLENGTH = 3

  assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
  assert(
  dfResults(0).get(
- 0
+ 1
  ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf"
  )
- assert(dfResults(0).get(1) == "test-aut-fixtures.rtf")
- assert(dfResults(0).get(2) == "rtf")
- assert(dfResults(0).get(3) == "application/rtf")
+ assert(dfResults(0).get(2) == "test-aut-fixtures.rtf")
+ assert(dfResults(0).get(3) == "rtf")
  assert(dfResults(0).get(4) == "application/rtf")
- assert(dfResults(0).get(5) == "e483512b65ba44d71e843c57de2adeb7")
- assert(dfResults(0).get(6) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
+ assert(dfResults(0).get(5) == "application/rtf")
+ assert(dfResults(0).get(6) == "e483512b65ba44d71e843c57de2adeb7")
+ assert(dfResults(0).get(7) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
  }

  after {
Expand Down

0 comments on commit 2b1deb8

Please sign in to comment.