diff --git a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java index fd83797e34..582f9a6a79 100644 --- a/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java +++ b/src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java @@ -78,6 +78,31 @@ public class ContentIndexer { private static final int MEDIA_FILE_SIZE_LIMIT = 300 * 1024; // Bytes private static final int NANOSECONDS_IN_A_MILLISECOND = 1000000; + private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: "; + + private static class IndexingContext { + final Map contentCache; + final Set tagsList; + final Map allUnits; + final Map publishedUnits; + final Map> indexProblemCache; + final boolean includeUnpublished; + + IndexingContext(final Map contentCache, final Set tagsList, + final Map allUnits, final Map publishedUnits, + final Map> indexProblemCache, final boolean includeUnpublished) { + this.contentCache = contentCache; + this.tagsList = tagsList; + this.allUnits = allUnits; + this.publishedUnits = publishedUnits; + this.indexProblemCache = indexProblemCache; + this.includeUnpublished = includeUnpublished; + } + + boolean shouldSkipUnpublished(final Content content) { + return !includeUnpublished && !content.getPublished(); + } + } @Inject public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) { @@ -212,46 +237,10 @@ private synchronized void buildGitContentIndex(final String sha, log.info("Populating git content cache based on sha " + sanitiseInternalLogValue(sha) + " ..."); // Traverse the git repository looking for the .json files + IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache, includeUnpublished); while (treeWalk.next()) { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ObjectLoader loader = repository.open(treeWalk.getObjectId(0)); - loader.copyTo(out); - - // setup object mapper to use preconfigured deserializer - // module. Required to deal with type polymorphism - ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); - - Content content; - try { - content = (Content) objectMapper.readValue(out.toString(), ContentBase.class); - - // check if we only want to index published content - if (!includeUnpublished && !content.getPublished()) { - log.debug("Skipping unpublished content: {}", content.getId()); - continue; - } - - content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); - - if (null != content) { - indexContentObject(contentCache, tagsList, allUnits, publishedUnits, indexProblemCache, - treeWalk.getPathString(), content); - } - } catch (JsonMappingException e) { - log.debug(String.format("Unable to parse the json file found %s as a content object. " - + "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage())); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - " - + treeWalk.getPathString() + ". The following error occurred: " + e.getMessage(), indexProblemCache); - } catch (IOException e) { - log.error("IOException while trying to parse {}", treeWalk.getPathString(), e); - Content dummyContent = new Content(); - dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); - this.registerContentProblem(dummyContent, - "Index failure - Unable to read the json file found - " + treeWalk.getPathString() - + ". The following error occurred: " + e.getMessage(), indexProblemCache); - } + processJsonFile(treeWalk, repository, context); } repository.close(); @@ -265,6 +254,54 @@ private synchronized void buildGitContentIndex(final String sha, } } + private void processJsonFile(final TreeWalk treeWalk, final Repository repository, + final IndexingContext context) { + try { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ObjectLoader loader = repository.open(treeWalk.getObjectId(0)); + loader.copyTo(out); + + ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper(); + + try { + Content content = (Content) objectMapper.readValue(out.toString(), ContentBase.class); + + if (context.shouldSkipUnpublished(content)) { + log.debug("Skipping unpublished content: {}", content.getId()); + return; + } + + content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished()); + + if (null != content) { + indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits, + context.indexProblemCache, treeWalk.getPathString(), content); + } + } catch (JsonMappingException e) { + log.debug(String.format("Unable to parse the json file found %s as a content object. " + + "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage())); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); + this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - " + + treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + } catch (IOException e) { + log.error("IOException while trying to parse {}", treeWalk.getPathString(), e); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); + this.registerContentProblem(dummyContent, + "Index failure - Unable to read the json file found - " + treeWalk.getPathString() + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + } + } catch (Exception e) { + log.error("Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e); + Content dummyContent = new Content(); + dummyContent.setCanonicalSourceFile(treeWalk.getPathString()); + this.registerContentProblem(dummyContent, + "Index failure - Unexpected error while processing file - " + treeWalk.getPathString() + + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache); + } + } + private void indexContentObject( final Map contentCache, final Set tagsList, final Map allUnits, final Map publishedUnits, final Map> indexProblemCache, @@ -277,82 +314,69 @@ private void indexContentObject( // add children (and parent) from flattened Set to // cache if they have ids + IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits, + indexProblemCache, true); for (Content flattenedContent : this.flattenContentObjects(content)) { - if (flattenedContent.getId() == null) { - continue; - } + validateAndCacheContent(flattenedContent, content, treeWalkPath, context); + } + } - // Prevents ETL indexing of quizzes that contain anything that is not an IsaacQuizSection - // in the top-level children array. - // NOTE: I'm not sure if this is the right place for this but I couldn't find a better one. - // This also seems to be the only time we can prevent a file from being indexed entirely. - if (flattenedContent instanceof IsaacQuiz) { - List children = flattenedContent.getChildren(); - if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) { - log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId()); - this.registerContentProblem(flattenedContent, "Index failure - Invalid " - + "content type among quiz sections. Quizzes can only contain quiz sections " - + "in the top-level children array.", indexProblemCache); - continue; - } - } + private void validateAndCacheContent(final Content flattenedContent, final Content parentContent, + final String treeWalkPath, final IndexingContext context) { + if (flattenedContent.getId() == null) { + return; + } - if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) { - log.debug("Content ID too long: {}", flattenedContent.getId()); - this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(), - indexProblemCache); - continue; + if (flattenedContent instanceof IsaacQuiz) { + List children = flattenedContent.getChildren(); + if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) { + log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId()); + this.registerContentProblem(flattenedContent, "Index failure - Invalid " + + "content type among quiz sections. Quizzes can only contain quiz sections " + + "in the top-level children array.", context.indexProblemCache); + return; } + } - if (flattenedContent.getId().contains(".")) { - // Otherwise, duplicate IDs with different content, - // therefore log an error - log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath); - - this.registerContentProblem(flattenedContent, "Index failure - Invalid ID " - + flattenedContent.getId() + " found in file " + treeWalkPath - + ". Must not contain restricted characters.", indexProblemCache); - continue; - } + if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) { + log.debug("Content ID too long: {}", flattenedContent.getId()); + this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(), + context.indexProblemCache); + return; + } - // check if we have seen this key before if - // we have then we don't want to add it - // again - if (!contentCache.containsKey(flattenedContent.getId())) { - // It must be new so we can add it - log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(), - treeWalkPath); - contentCache.put(flattenedContent.getId(), flattenedContent); - registerTags(flattenedContent.getTags(), tagsList); - - // If this is a numeric question, extract any - // units from its answers. - - if (flattenedContent instanceof IsaacNumericQuestion) { - registerUnits((IsaacNumericQuestion) flattenedContent, allUnits, publishedUnits); - } + if (flattenedContent.getId().contains(".")) { + log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); + this.registerContentProblem(flattenedContent, "Index failure - Invalid ID " + + flattenedContent.getId() + " found in file " + treeWalkPath + + ". Must not contain restricted characters.", context.indexProblemCache); + return; + } - continue; // our work here is done - } + if (!context.contentCache.containsKey(flattenedContent.getId())) { + log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(), + treeWalkPath); + context.contentCache.put(flattenedContent.getId(), flattenedContent); + registerTags(flattenedContent.getTags(), context.tagsList); - // shaCache contains key already, compare the - // content - if (contentCache.get(flattenedContent.getId()).equals(flattenedContent)) { - // content is the same therefore it is just - // reuse of a content object so that is - // fine. - log.debug("Resource ({}) already seen in cache. Skipping {}", content.getId(), treeWalkPath); - continue; + if (flattenedContent instanceof IsaacNumericQuestion) { + registerUnits((IsaacNumericQuestion) flattenedContent, context.allUnits, context.publishedUnits); } + return; + } - // Otherwise, duplicate IDs with different content, - // therefore log an error - log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath); - this.registerContentProblem(flattenedContent, String.format( - "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", - content.getId(), treeWalkPath, contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()), - indexProblemCache); + if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) { + log.debug("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath); + return; } + + log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath); + this.registerContentProblem(flattenedContent, String.format( + "Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.", + parentContent.getId(), + treeWalkPath, + context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()), + context.indexProblemCache); } /** @@ -419,44 +443,7 @@ private Content augmentChildContent(final Content content, final String canonica } if (content instanceof Question) { - Question question = (Question) content; - if (question.getHints() != null) { - for (ContentBase cb : question.getHints()) { - Content c = (Content) cb; - this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); - } - } - - // Augment question answers - if (question.getAnswer() != null) { - Content answer = (Content) question.getAnswer(); - if (answer.getChildren() != null) { - for (ContentBase cb : answer.getChildren()) { - Content c = (Content) cb; - this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); - } - } - } - - if (question.getDefaultFeedback() != null) { - Content defaultFeedback = question.getDefaultFeedback(); - if (defaultFeedback.getChildren() != null) { - for (ContentBase cb : defaultFeedback.getChildren()) { - Content c = (Content) cb; - this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); - } - } - } - - if (content instanceof ChoiceQuestion) { - ChoiceQuestion choiceQuestion = (ChoiceQuestion) content; - if (choiceQuestion.getChoices() != null) { - for (ContentBase cb : choiceQuestion.getChoices()) { - Content c = (Content) cb; - this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); - } - } - } + augmentQuestionContent((Question) content, canonicalSourceFile, newParentId, parentPublished); } // try to determine if we have media as fields to deal with in this class @@ -521,10 +508,68 @@ private void collateSearchableContent(final Content content, final StringBuilder /** * Concatenate the source of a media content object with its parent source file. * - * @param canonicalSourceFile the canonical path to use for concat operations. - * @param originalSrc to modify + * @param canonicalSourceFile the canonical path to use for concat operations * @return src with relative paths fixed. */ + private void augmentQuestionContent(final Question question, final String canonicalSourceFile, + final String newParentId, final boolean parentPublished) { + augmentHints(question, canonicalSourceFile, newParentId, parentPublished); + augmentAnswerContent(question, canonicalSourceFile, newParentId, parentPublished); + augmentFeedbackContent(question, canonicalSourceFile, newParentId, parentPublished); + augmentChoiceQuestionContent(question, canonicalSourceFile, newParentId, parentPublished); + } + + private void augmentHints(final Question question, final String canonicalSourceFile, final String newParentId, + final boolean parentPublished) { + if (question.getHints() != null) { + for (ContentBase cb : question.getHints()) { + Content c = (Content) cb; + this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); + } + } + } + + private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId, + final boolean parentPublished) { + if (question.getAnswer() != null) { + Content answer = (Content) question.getAnswer(); + if (answer.getChildren() != null) { + for (ContentBase cb : answer.getChildren()) { + Content c = (Content) cb; + this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); + } + } + } + } + + private void augmentFeedbackContent(final Question question, + final String canonicalSourceFile, + final String newParentId, + final boolean parentPublished) { + if (question.getDefaultFeedback() != null) { + Content defaultFeedback = question.getDefaultFeedback(); + if (defaultFeedback.getChildren() != null) { + for (ContentBase cb : defaultFeedback.getChildren()) { + Content c = (Content) cb; + this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); + } + } + } + } + + private void augmentChoiceQuestionContent(final Question question, final String canonicalSourceFile, + final String newParentId, final boolean parentPublished) { + if (question instanceof ChoiceQuestion) { + ChoiceQuestion choiceQuestion = (ChoiceQuestion) question; + if (choiceQuestion.getChoices() != null) { + for (ContentBase cb : choiceQuestion.getChoices()) { + Content c = (Content) cb; + this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished); + } + } + } + } + private String fixMediaSrc(final String canonicalSourceFile, final String originalSrc) { if (originalSrc != null && !(originalSrc.startsWith("http://") || originalSrc.startsWith("https://") || originalSrc.startsWith("/assets/"))) {