Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 189 additions & 144 deletions src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,31 @@ public class ContentIndexer {

private static final int MEDIA_FILE_SIZE_LIMIT = 300 * 1024; // Bytes
private static final int NANOSECONDS_IN_A_MILLISECOND = 1000000;
private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: ";

private static class IndexingContext {
final Map<String, Content> contentCache;
final Set<String> tagsList;
final Map<String, String> allUnits;
final Map<String, String> publishedUnits;
final Map<Content, List<String>> indexProblemCache;
final boolean includeUnpublished;

IndexingContext(final Map<String, Content> contentCache, final Set<String> tagsList,
final Map<String, String> allUnits, final Map<String, String> publishedUnits,
final Map<Content, List<String>> indexProblemCache, final boolean includeUnpublished) {
this.contentCache = contentCache;
this.tagsList = tagsList;
this.allUnits = allUnits;
this.publishedUnits = publishedUnits;
this.indexProblemCache = indexProblemCache;
this.includeUnpublished = includeUnpublished;
}

boolean shouldSkipUnpublished(final Content content) {
return !includeUnpublished && !content.getPublished();
}
}

@Inject
public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) {
Expand Down Expand Up @@ -212,46 +237,10 @@ private synchronized void buildGitContentIndex(final String sha,
log.info("Populating git content cache based on sha " + sanitiseInternalLogValue(sha) + " ...");

// Traverse the git repository looking for the .json files
IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits,
indexProblemCache, includeUnpublished);
while (treeWalk.next()) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
ObjectLoader loader = repository.open(treeWalk.getObjectId(0));
loader.copyTo(out);

// setup object mapper to use preconfigured deserializer
// module. Required to deal with type polymorphism
ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper();

Content content;
try {
content = (Content) objectMapper.readValue(out.toString(), ContentBase.class);

// check if we only want to index published content
if (!includeUnpublished && !content.getPublished()) {
log.debug("Skipping unpublished content: {}", content.getId());
continue;
}

content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished());

if (null != content) {
indexContentObject(contentCache, tagsList, allUnits, publishedUnits, indexProblemCache,
treeWalk.getPathString(), content);
}
} catch (JsonMappingException e) {
log.debug(String.format("Unable to parse the json file found %s as a content object. "
+ "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage()));
Content dummyContent = new Content();
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - "
+ treeWalk.getPathString() + ". The following error occurred: " + e.getMessage(), indexProblemCache);
} catch (IOException e) {
log.error("IOException while trying to parse {}", treeWalk.getPathString(), e);
Content dummyContent = new Content();
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
this.registerContentProblem(dummyContent,
"Index failure - Unable to read the json file found - " + treeWalk.getPathString()
+ ". The following error occurred: " + e.getMessage(), indexProblemCache);
}
processJsonFile(treeWalk, repository, context);
}

repository.close();
Expand All @@ -265,6 +254,54 @@ private synchronized void buildGitContentIndex(final String sha,
}
}

private void processJsonFile(final TreeWalk treeWalk, final Repository repository,
final IndexingContext context) {
try {
ByteArrayOutputStream out = new ByteArrayOutputStream();
ObjectLoader loader = repository.open(treeWalk.getObjectId(0));
loader.copyTo(out);

ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper();

try {
Content content = (Content) objectMapper.readValue(out.toString(), ContentBase.class);

if (context.shouldSkipUnpublished(content)) {
log.debug("Skipping unpublished content: {}", content.getId());
return;
}

content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished());

if (null != content) {
indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits,
context.indexProblemCache, treeWalk.getPathString(), content);
}
} catch (JsonMappingException e) {
log.debug(String.format("Unable to parse the json file found %s as a content object. "
+ "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage()));
Content dummyContent = new Content();
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - "
+ treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
} catch (IOException e) {
log.error("IOException while trying to parse {}", treeWalk.getPathString(), e);
Content dummyContent = new Content();
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
this.registerContentProblem(dummyContent,
"Index failure - Unable to read the json file found - " + treeWalk.getPathString()
+ ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
}
} catch (Exception e) {
log.error("Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e);
Content dummyContent = new Content();
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
this.registerContentProblem(dummyContent,
"Index failure - Unexpected error while processing file - " + treeWalk.getPathString()
+ ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
}
}

private void indexContentObject(
final Map<String, Content> contentCache, final Set<String> tagsList, final Map<String, String> allUnits,
final Map<String, String> publishedUnits, final Map<Content, List<String>> indexProblemCache,
Expand All @@ -277,82 +314,69 @@ private void indexContentObject(

// add children (and parent) from flattened Set to
// cache if they have ids
IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits,
indexProblemCache, true);
for (Content flattenedContent : this.flattenContentObjects(content)) {
if (flattenedContent.getId() == null) {
continue;
}
validateAndCacheContent(flattenedContent, content, treeWalkPath, context);
}
}

// Prevents ETL indexing of quizzes that contain anything that is not an IsaacQuizSection
// in the top-level children array.
// NOTE: I'm not sure if this is the right place for this but I couldn't find a better one.
// This also seems to be the only time we can prevent a file from being indexed entirely.
if (flattenedContent instanceof IsaacQuiz) {
List<ContentBase> children = flattenedContent.getChildren();
if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) {
log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId());
this.registerContentProblem(flattenedContent, "Index failure - Invalid "
+ "content type among quiz sections. Quizzes can only contain quiz sections "
+ "in the top-level children array.", indexProblemCache);
continue;
}
}
private void validateAndCacheContent(final Content flattenedContent, final Content parentContent,
final String treeWalkPath, final IndexingContext context) {
if (flattenedContent.getId() == null) {
return;
}

if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) {
log.debug("Content ID too long: {}", flattenedContent.getId());
this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(),
indexProblemCache);
continue;
if (flattenedContent instanceof IsaacQuiz) {
List<ContentBase> children = flattenedContent.getChildren();
if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) {
log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId());
this.registerContentProblem(flattenedContent, "Index failure - Invalid "
+ "content type among quiz sections. Quizzes can only contain quiz sections "
+ "in the top-level children array.", context.indexProblemCache);
return;
}
}

if (flattenedContent.getId().contains(".")) {
// Otherwise, duplicate IDs with different content,
// therefore log an error
log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath);

this.registerContentProblem(flattenedContent, "Index failure - Invalid ID "
+ flattenedContent.getId() + " found in file " + treeWalkPath
+ ". Must not contain restricted characters.", indexProblemCache);
continue;
}
if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) {
log.debug("Content ID too long: {}", flattenedContent.getId());
this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(),
context.indexProblemCache);
return;
}

// check if we have seen this key before if
// we have then we don't want to add it
// again
if (!contentCache.containsKey(flattenedContent.getId())) {
// It must be new so we can add it
log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(),
treeWalkPath);
contentCache.put(flattenedContent.getId(), flattenedContent);
registerTags(flattenedContent.getTags(), tagsList);

// If this is a numeric question, extract any
// units from its answers.

if (flattenedContent instanceof IsaacNumericQuestion) {
registerUnits((IsaacNumericQuestion) flattenedContent, allUnits, publishedUnits);
}
if (flattenedContent.getId().contains(".")) {
log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath);
this.registerContentProblem(flattenedContent, "Index failure - Invalid ID "
+ flattenedContent.getId() + " found in file " + treeWalkPath
+ ". Must not contain restricted characters.", context.indexProblemCache);
return;
}

continue; // our work here is done
}
if (!context.contentCache.containsKey(flattenedContent.getId())) {
log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(),
treeWalkPath);
context.contentCache.put(flattenedContent.getId(), flattenedContent);
registerTags(flattenedContent.getTags(), context.tagsList);

// shaCache contains key already, compare the
// content
if (contentCache.get(flattenedContent.getId()).equals(flattenedContent)) {
// content is the same therefore it is just
// reuse of a content object so that is
// fine.
log.debug("Resource ({}) already seen in cache. Skipping {}", content.getId(), treeWalkPath);
continue;
if (flattenedContent instanceof IsaacNumericQuestion) {
registerUnits((IsaacNumericQuestion) flattenedContent, context.allUnits, context.publishedUnits);
}
return;
}

// Otherwise, duplicate IDs with different content,
// therefore log an error
log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath);
this.registerContentProblem(flattenedContent, String.format(
"Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.",
content.getId(), treeWalkPath, contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()),
indexProblemCache);
if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) {
log.debug("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath);
return;
}

log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath);
this.registerContentProblem(flattenedContent, String.format(
"Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.",
parentContent.getId(),
treeWalkPath,
context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()),
context.indexProblemCache);
}

/**
Expand Down Expand Up @@ -419,44 +443,7 @@ private Content augmentChildContent(final Content content, final String canonica
}

if (content instanceof Question) {
Question question = (Question) content;
if (question.getHints() != null) {
for (ContentBase cb : question.getHints()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}

// Augment question answers
if (question.getAnswer() != null) {
Content answer = (Content) question.getAnswer();
if (answer.getChildren() != null) {
for (ContentBase cb : answer.getChildren()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}

if (question.getDefaultFeedback() != null) {
Content defaultFeedback = question.getDefaultFeedback();
if (defaultFeedback.getChildren() != null) {
for (ContentBase cb : defaultFeedback.getChildren()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}

if (content instanceof ChoiceQuestion) {
ChoiceQuestion choiceQuestion = (ChoiceQuestion) content;
if (choiceQuestion.getChoices() != null) {
for (ContentBase cb : choiceQuestion.getChoices()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}
augmentQuestionContent((Question) content, canonicalSourceFile, newParentId, parentPublished);
}

// try to determine if we have media as fields to deal with in this class
Expand Down Expand Up @@ -521,10 +508,68 @@ private void collateSearchableContent(final Content content, final StringBuilder
/**
* Concatenate the source of a media content object with its parent source file.
*
* @param canonicalSourceFile the canonical path to use for concat operations.
* @param originalSrc to modify
* @param canonicalSourceFile the canonical path to use for concat operations
* @return src with relative paths fixed.
*/
private void augmentQuestionContent(final Question question, final String canonicalSourceFile,
final String newParentId, final boolean parentPublished) {
augmentHints(question, canonicalSourceFile, newParentId, parentPublished);
augmentAnswerContent(question, canonicalSourceFile, newParentId, parentPublished);
augmentFeedbackContent(question, canonicalSourceFile, newParentId, parentPublished);
augmentChoiceQuestionContent(question, canonicalSourceFile, newParentId, parentPublished);
}

private void augmentHints(final Question question, final String canonicalSourceFile, final String newParentId,
final boolean parentPublished) {
if (question.getHints() != null) {
for (ContentBase cb : question.getHints()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}

private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId,
final boolean parentPublished) {
if (question.getAnswer() != null) {
Content answer = (Content) question.getAnswer();
if (answer.getChildren() != null) {
for (ContentBase cb : answer.getChildren()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}
}

private void augmentFeedbackContent(final Question question,
final String canonicalSourceFile,
final String newParentId,
final boolean parentPublished) {
if (question.getDefaultFeedback() != null) {
Content defaultFeedback = question.getDefaultFeedback();
if (defaultFeedback.getChildren() != null) {
for (ContentBase cb : defaultFeedback.getChildren()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}
}

private void augmentChoiceQuestionContent(final Question question, final String canonicalSourceFile,
final String newParentId, final boolean parentPublished) {
if (question instanceof ChoiceQuestion) {
ChoiceQuestion choiceQuestion = (ChoiceQuestion) question;
if (choiceQuestion.getChoices() != null) {
for (ContentBase cb : choiceQuestion.getChoices()) {
Content c = (Content) cb;
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
}
}
}
}

private String fixMediaSrc(final String canonicalSourceFile, final String originalSrc) {
if (originalSrc != null && !(originalSrc.startsWith("http://") || originalSrc.startsWith("https://")
|| originalSrc.startsWith("/assets/"))) {
Expand Down
Loading