Skip to content

Commit 818b6c4

Browse files
ETLManager:82 - Indexing version latest failed - #835 (#407)
* ETLManager:82 - Indexing version latest failed - #835
1 parent dfee59b commit 818b6c4

1 file changed

Lines changed: 189 additions & 144 deletions

File tree

src/main/java/uk/ac/cam/cl/dtg/segue/etl/ContentIndexer.java

Lines changed: 189 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,31 @@ public class ContentIndexer {
7878

7979
private static final int MEDIA_FILE_SIZE_LIMIT = 300 * 1024; // Bytes
8080
private static final int NANOSECONDS_IN_A_MILLISECOND = 1000000;
81+
private static final String ERROR_OCCURRED_SUFFIX = ". The following error occurred: ";
82+
83+
private static class IndexingContext {
84+
final Map<String, Content> contentCache;
85+
final Set<String> tagsList;
86+
final Map<String, String> allUnits;
87+
final Map<String, String> publishedUnits;
88+
final Map<Content, List<String>> indexProblemCache;
89+
final boolean includeUnpublished;
90+
91+
IndexingContext(final Map<String, Content> contentCache, final Set<String> tagsList,
92+
final Map<String, String> allUnits, final Map<String, String> publishedUnits,
93+
final Map<Content, List<String>> indexProblemCache, final boolean includeUnpublished) {
94+
this.contentCache = contentCache;
95+
this.tagsList = tagsList;
96+
this.allUnits = allUnits;
97+
this.publishedUnits = publishedUnits;
98+
this.indexProblemCache = indexProblemCache;
99+
this.includeUnpublished = includeUnpublished;
100+
}
101+
102+
boolean shouldSkipUnpublished(final Content content) {
103+
return !includeUnpublished && !content.getPublished();
104+
}
105+
}
81106

82107
@Inject
83108
public ContentIndexer(final GitDb database, final ElasticSearchIndexer es, final ContentMapperUtils mapperUtils) {
@@ -212,46 +237,10 @@ private synchronized void buildGitContentIndex(final String sha,
212237
log.info("Populating git content cache based on sha " + sanitiseInternalLogValue(sha) + " ...");
213238

214239
// Traverse the git repository looking for the .json files
240+
IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits,
241+
indexProblemCache, includeUnpublished);
215242
while (treeWalk.next()) {
216-
ByteArrayOutputStream out = new ByteArrayOutputStream();
217-
ObjectLoader loader = repository.open(treeWalk.getObjectId(0));
218-
loader.copyTo(out);
219-
220-
// setup object mapper to use preconfigured deserializer
221-
// module. Required to deal with type polymorphism
222-
ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper();
223-
224-
Content content;
225-
try {
226-
content = (Content) objectMapper.readValue(out.toString(), ContentBase.class);
227-
228-
// check if we only want to index published content
229-
if (!includeUnpublished && !content.getPublished()) {
230-
log.debug("Skipping unpublished content: {}", content.getId());
231-
continue;
232-
}
233-
234-
content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished());
235-
236-
if (null != content) {
237-
indexContentObject(contentCache, tagsList, allUnits, publishedUnits, indexProblemCache,
238-
treeWalk.getPathString(), content);
239-
}
240-
} catch (JsonMappingException e) {
241-
log.debug(String.format("Unable to parse the json file found %s as a content object. "
242-
+ "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage()));
243-
Content dummyContent = new Content();
244-
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
245-
this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - "
246-
+ treeWalk.getPathString() + ". The following error occurred: " + e.getMessage(), indexProblemCache);
247-
} catch (IOException e) {
248-
log.error("IOException while trying to parse {}", treeWalk.getPathString(), e);
249-
Content dummyContent = new Content();
250-
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
251-
this.registerContentProblem(dummyContent,
252-
"Index failure - Unable to read the json file found - " + treeWalk.getPathString()
253-
+ ". The following error occurred: " + e.getMessage(), indexProblemCache);
254-
}
243+
processJsonFile(treeWalk, repository, context);
255244
}
256245

257246
repository.close();
@@ -265,6 +254,54 @@ private synchronized void buildGitContentIndex(final String sha,
265254
}
266255
}
267256

257+
private void processJsonFile(final TreeWalk treeWalk, final Repository repository,
258+
final IndexingContext context) {
259+
try {
260+
ByteArrayOutputStream out = new ByteArrayOutputStream();
261+
ObjectLoader loader = repository.open(treeWalk.getObjectId(0));
262+
loader.copyTo(out);
263+
264+
ObjectMapper objectMapper = mapperUtils.getSharedContentObjectMapper();
265+
266+
try {
267+
Content content = (Content) objectMapper.readValue(out.toString(), ContentBase.class);
268+
269+
if (context.shouldSkipUnpublished(content)) {
270+
log.debug("Skipping unpublished content: {}", content.getId());
271+
return;
272+
}
273+
274+
content = this.augmentChildContent(content, treeWalk.getPathString(), null, content.getPublished());
275+
276+
if (null != content) {
277+
indexContentObject(context.contentCache, context.tagsList, context.allUnits, context.publishedUnits,
278+
context.indexProblemCache, treeWalk.getPathString(), content);
279+
}
280+
} catch (JsonMappingException e) {
281+
log.debug(String.format("Unable to parse the json file found %s as a content object. "
282+
+ "Skipping file due to error: \n %s", treeWalk.getPathString(), e.getMessage()));
283+
Content dummyContent = new Content();
284+
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
285+
this.registerContentProblem(dummyContent, "Index failure - Unable to parse json file found - "
286+
+ treeWalk.getPathString() + ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
287+
} catch (IOException e) {
288+
log.error("IOException while trying to parse {}", treeWalk.getPathString(), e);
289+
Content dummyContent = new Content();
290+
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
291+
this.registerContentProblem(dummyContent,
292+
"Index failure - Unable to read the json file found - " + treeWalk.getPathString()
293+
+ ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
294+
}
295+
} catch (Exception e) {
296+
log.error("Unexpected error while processing file {}: {}", treeWalk.getPathString(), e.getMessage(), e);
297+
Content dummyContent = new Content();
298+
dummyContent.setCanonicalSourceFile(treeWalk.getPathString());
299+
this.registerContentProblem(dummyContent,
300+
"Index failure - Unexpected error while processing file - " + treeWalk.getPathString()
301+
+ ERROR_OCCURRED_SUFFIX + e.getMessage(), context.indexProblemCache);
302+
}
303+
}
304+
268305
private void indexContentObject(
269306
final Map<String, Content> contentCache, final Set<String> tagsList, final Map<String, String> allUnits,
270307
final Map<String, String> publishedUnits, final Map<Content, List<String>> indexProblemCache,
@@ -277,82 +314,69 @@ private void indexContentObject(
277314

278315
// add children (and parent) from flattened Set to
279316
// cache if they have ids
317+
IndexingContext context = new IndexingContext(contentCache, tagsList, allUnits, publishedUnits,
318+
indexProblemCache, true);
280319
for (Content flattenedContent : this.flattenContentObjects(content)) {
281-
if (flattenedContent.getId() == null) {
282-
continue;
283-
}
320+
validateAndCacheContent(flattenedContent, content, treeWalkPath, context);
321+
}
322+
}
284323

285-
// Prevents ETL indexing of quizzes that contain anything that is not an IsaacQuizSection
286-
// in the top-level children array.
287-
// NOTE: I'm not sure if this is the right place for this but I couldn't find a better one.
288-
// This also seems to be the only time we can prevent a file from being indexed entirely.
289-
if (flattenedContent instanceof IsaacQuiz) {
290-
List<ContentBase> children = flattenedContent.getChildren();
291-
if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) {
292-
log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId());
293-
this.registerContentProblem(flattenedContent, "Index failure - Invalid "
294-
+ "content type among quiz sections. Quizzes can only contain quiz sections "
295-
+ "in the top-level children array.", indexProblemCache);
296-
continue;
297-
}
298-
}
324+
private void validateAndCacheContent(final Content flattenedContent, final Content parentContent,
325+
final String treeWalkPath, final IndexingContext context) {
326+
if (flattenedContent.getId() == null) {
327+
return;
328+
}
299329

300-
if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) {
301-
log.debug("Content ID too long: {}", flattenedContent.getId());
302-
this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(),
303-
indexProblemCache);
304-
continue;
330+
if (flattenedContent instanceof IsaacQuiz) {
331+
List<ContentBase> children = flattenedContent.getChildren();
332+
if (children.stream().anyMatch(c -> !(c instanceof IsaacQuizSection))) {
333+
log.debug("IsaacQuiz ({}) contains top-level non-quiz sections. Skipping.", flattenedContent.getId());
334+
this.registerContentProblem(flattenedContent, "Index failure - Invalid "
335+
+ "content type among quiz sections. Quizzes can only contain quiz sections "
336+
+ "in the top-level children array.", context.indexProblemCache);
337+
return;
305338
}
339+
}
306340

307-
if (flattenedContent.getId().contains(".")) {
308-
// Otherwise, duplicate IDs with different content,
309-
// therefore log an error
310-
log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath);
311-
312-
this.registerContentProblem(flattenedContent, "Index failure - Invalid ID "
313-
+ flattenedContent.getId() + " found in file " + treeWalkPath
314-
+ ". Must not contain restricted characters.", indexProblemCache);
315-
continue;
316-
}
341+
if (flattenedContent.getId().length() > MAXIMUM_CONTENT_ID_LENGTH) {
342+
log.debug("Content ID too long: {}", flattenedContent.getId());
343+
this.registerContentProblem(flattenedContent, "Content ID too long: " + flattenedContent.getId(),
344+
context.indexProblemCache);
345+
return;
346+
}
317347

318-
// check if we have seen this key before if
319-
// we have then we don't want to add it
320-
// again
321-
if (!contentCache.containsKey(flattenedContent.getId())) {
322-
// It must be new so we can add it
323-
log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(),
324-
treeWalkPath);
325-
contentCache.put(flattenedContent.getId(), flattenedContent);
326-
registerTags(flattenedContent.getTags(), tagsList);
327-
328-
// If this is a numeric question, extract any
329-
// units from its answers.
330-
331-
if (flattenedContent instanceof IsaacNumericQuestion) {
332-
registerUnits((IsaacNumericQuestion) flattenedContent, allUnits, publishedUnits);
333-
}
348+
if (flattenedContent.getId().contains(".")) {
349+
log.debug("Resource with invalid ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath);
350+
this.registerContentProblem(flattenedContent, "Index failure - Invalid ID "
351+
+ flattenedContent.getId() + " found in file " + treeWalkPath
352+
+ ". Must not contain restricted characters.", context.indexProblemCache);
353+
return;
354+
}
334355

335-
continue; // our work here is done
336-
}
356+
if (!context.contentCache.containsKey(flattenedContent.getId())) {
357+
log.debug("Loading into cache: {} ({}) from {}", flattenedContent.getId(), flattenedContent.getType(),
358+
treeWalkPath);
359+
context.contentCache.put(flattenedContent.getId(), flattenedContent);
360+
registerTags(flattenedContent.getTags(), context.tagsList);
337361

338-
// shaCache contains key already, compare the
339-
// content
340-
if (contentCache.get(flattenedContent.getId()).equals(flattenedContent)) {
341-
// content is the same therefore it is just
342-
// reuse of a content object so that is
343-
// fine.
344-
log.debug("Resource ({}) already seen in cache. Skipping {}", content.getId(), treeWalkPath);
345-
continue;
362+
if (flattenedContent instanceof IsaacNumericQuestion) {
363+
registerUnits((IsaacNumericQuestion) flattenedContent, context.allUnits, context.publishedUnits);
346364
}
365+
return;
366+
}
347367

348-
// Otherwise, duplicate IDs with different content,
349-
// therefore log an error
350-
log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", content.getId(), treeWalkPath);
351-
this.registerContentProblem(flattenedContent, String.format(
352-
"Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.",
353-
content.getId(), treeWalkPath, contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()),
354-
indexProblemCache);
368+
if (context.contentCache.get(flattenedContent.getId()).equals(flattenedContent)) {
369+
log.debug("Resource ({}) already seen in cache. Skipping {}", parentContent.getId(), treeWalkPath);
370+
return;
355371
}
372+
373+
log.debug("Resource with duplicate ID ({}) detected in cache. Skipping {}", parentContent.getId(), treeWalkPath);
374+
this.registerContentProblem(flattenedContent, String.format(
375+
"Index failure - Duplicate ID (%s) found in files (%s) and (%s): only one will be available.",
376+
parentContent.getId(),
377+
treeWalkPath,
378+
context.contentCache.get(flattenedContent.getId()).getCanonicalSourceFile()),
379+
context.indexProblemCache);
356380
}
357381

358382
/**
@@ -419,44 +443,7 @@ private Content augmentChildContent(final Content content, final String canonica
419443
}
420444

421445
if (content instanceof Question) {
422-
Question question = (Question) content;
423-
if (question.getHints() != null) {
424-
for (ContentBase cb : question.getHints()) {
425-
Content c = (Content) cb;
426-
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
427-
}
428-
}
429-
430-
// Augment question answers
431-
if (question.getAnswer() != null) {
432-
Content answer = (Content) question.getAnswer();
433-
if (answer.getChildren() != null) {
434-
for (ContentBase cb : answer.getChildren()) {
435-
Content c = (Content) cb;
436-
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
437-
}
438-
}
439-
}
440-
441-
if (question.getDefaultFeedback() != null) {
442-
Content defaultFeedback = question.getDefaultFeedback();
443-
if (defaultFeedback.getChildren() != null) {
444-
for (ContentBase cb : defaultFeedback.getChildren()) {
445-
Content c = (Content) cb;
446-
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
447-
}
448-
}
449-
}
450-
451-
if (content instanceof ChoiceQuestion) {
452-
ChoiceQuestion choiceQuestion = (ChoiceQuestion) content;
453-
if (choiceQuestion.getChoices() != null) {
454-
for (ContentBase cb : choiceQuestion.getChoices()) {
455-
Content c = (Content) cb;
456-
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
457-
}
458-
}
459-
}
446+
augmentQuestionContent((Question) content, canonicalSourceFile, newParentId, parentPublished);
460447
}
461448

462449
// try to determine if we have media as fields to deal with in this class
@@ -521,10 +508,68 @@ private void collateSearchableContent(final Content content, final StringBuilder
521508
/**
522509
* Concatenate the source of a media content object with its parent source file.
523510
*
524-
* @param canonicalSourceFile the canonical path to use for concat operations.
525-
* @param originalSrc to modify
511+
* @param canonicalSourceFile the canonical path to use for concat operations
526512
* @return src with relative paths fixed.
527513
*/
514+
private void augmentQuestionContent(final Question question, final String canonicalSourceFile,
515+
final String newParentId, final boolean parentPublished) {
516+
augmentHints(question, canonicalSourceFile, newParentId, parentPublished);
517+
augmentAnswerContent(question, canonicalSourceFile, newParentId, parentPublished);
518+
augmentFeedbackContent(question, canonicalSourceFile, newParentId, parentPublished);
519+
augmentChoiceQuestionContent(question, canonicalSourceFile, newParentId, parentPublished);
520+
}
521+
522+
private void augmentHints(final Question question, final String canonicalSourceFile, final String newParentId,
523+
final boolean parentPublished) {
524+
if (question.getHints() != null) {
525+
for (ContentBase cb : question.getHints()) {
526+
Content c = (Content) cb;
527+
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
528+
}
529+
}
530+
}
531+
532+
private void augmentAnswerContent(final Question question, final String canonicalSourceFile, final String newParentId,
533+
final boolean parentPublished) {
534+
if (question.getAnswer() != null) {
535+
Content answer = (Content) question.getAnswer();
536+
if (answer.getChildren() != null) {
537+
for (ContentBase cb : answer.getChildren()) {
538+
Content c = (Content) cb;
539+
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
540+
}
541+
}
542+
}
543+
}
544+
545+
private void augmentFeedbackContent(final Question question,
546+
final String canonicalSourceFile,
547+
final String newParentId,
548+
final boolean parentPublished) {
549+
if (question.getDefaultFeedback() != null) {
550+
Content defaultFeedback = question.getDefaultFeedback();
551+
if (defaultFeedback.getChildren() != null) {
552+
for (ContentBase cb : defaultFeedback.getChildren()) {
553+
Content c = (Content) cb;
554+
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
555+
}
556+
}
557+
}
558+
}
559+
560+
private void augmentChoiceQuestionContent(final Question question, final String canonicalSourceFile,
561+
final String newParentId, final boolean parentPublished) {
562+
if (question instanceof ChoiceQuestion) {
563+
ChoiceQuestion choiceQuestion = (ChoiceQuestion) question;
564+
if (choiceQuestion.getChoices() != null) {
565+
for (ContentBase cb : choiceQuestion.getChoices()) {
566+
Content c = (Content) cb;
567+
this.augmentChildContent(c, canonicalSourceFile, newParentId, parentPublished);
568+
}
569+
}
570+
}
571+
}
572+
528573
private String fixMediaSrc(final String canonicalSourceFile, final String originalSrc) {
529574
if (originalSrc != null && !(originalSrc.startsWith("http://") || originalSrc.startsWith("https://")
530575
|| originalSrc.startsWith("/assets/"))) {

0 commit comments

Comments
 (0)