Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.collections4.CollectionUtils;

import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.BibDataSet;
import org.grobid.core.data.BiblioItem;
Expand Down Expand Up @@ -57,6 +58,11 @@ public CitationParser(EngineParsers parsers) {
this.parsers = parsers;
}

/**
 * Package-private constructor that builds the parser on an explicitly supplied model.
 *
 * @param parsers the engine parsers, stored in {@code this.parsers}
 * @param model   the model forwarded unchanged to the superclass constructor
 */
CitationParser(EngineParsers parsers, GrobidModel model) {
super(model);
this.parsers = parsers;
}

/**
* Process one single raw reference string
*/
Expand All @@ -80,15 +86,19 @@ public List<BiblioItem> processingStringMultiple(List<String> inputs, int consol
return null;
List<List<LayoutToken>> tokenList = new ArrayList<>();
for(String input : inputs) {
if (StringUtils.isBlank(input))
// normalize first so non-breaking spaces etc. become regular spaces
input = UnicodeUtil.normaliseText(input);
if (StringUtils.isBlank(input))
tokenList.add(new ArrayList<LayoutToken>());
Comment thread
lfoppiano marked this conversation as resolved.
else {
// some cleaning
input = UnicodeUtil.normaliseText(input);
input = TextUtilities.removeLeadingAndTrailingChars(input, "[({.,])}: \n"," \n");
List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input);
tokens = analyzer.retokenizeSubdigitsFromLayoutToken(tokens);
tokenList.add(tokens);
if (StringUtils.isBlank(input)) {
tokenList.add(new ArrayList<LayoutToken>());
} else {
List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input);
tokens = analyzer.retokenizeSubdigitsFromLayoutToken(tokens);
tokenList.add(tokens);
}
Comment thread
lfoppiano marked this conversation as resolved.
}
}

Expand All @@ -98,7 +108,7 @@ public List<BiblioItem> processingStringMultiple(List<String> inputs, int consol
int i = 0;
for (BiblioItem result : results) {
if (result != null) {
String localInput = inputs.get(i);
String localInput = UnicodeUtil.normaliseText(inputs.get(i));
localInput = TextUtilities.removeLeadingAndTrailingChars(localInput, "[({.,])}: \n"," \n");
result.setReference(localInput);
}
Expand Down Expand Up @@ -180,7 +190,7 @@ public List<BiblioItem> processingLayoutTokenMultiple(List<List<LayoutToken>> to
int i = 0;
for (List<LayoutToken> tokens : tokenList) {
if (CollectionUtils.isEmpty(tokens))
results.add(null);
results.add(new BiblioItem());
Comment thread
lfoppiano marked this conversation as resolved.
else {
String res = resBlocks[i];
i++;
Expand Down Expand Up @@ -260,6 +270,8 @@ public List<BibDataSet> processingReferenceSection(String referenceTextBlock, Re
}

List<BiblioItem> bibList = processingLayoutTokenMultiple(allRefBlocks, 0);
if (bibList == null)
return results;
int i = 0;
for (LabeledReferenceResult ref : segm) {
if (ref.getTokens() == null || ref.getTokens().size() == 0)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import nu.xom.Element;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.MutableTriple;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -165,7 +166,7 @@ public List<BiblioItem> processRawReferences(List<String> references, int consol
return finalResults;

List<BiblioItem> results = parsers.getCitationParser().processingStringMultiple(references, 0);
if (results.size() == 0)
if (CollectionUtils.isEmpty(results))
return finalResults;
Comment thread
lfoppiano marked this conversation as resolved.

// consolidation in a second stage to take advantage of parallel calls
Expand All @@ -175,6 +176,8 @@ public List<BiblioItem> processRawReferences(List<String> references, int consol
// prepare for set consolidation
List<BibDataSet> bibDataSetResults = new ArrayList<BibDataSet>();
for (BiblioItem bib : results) {
if (bib == null)
continue;
BibDataSet bds = new BibDataSet();
bds.setResBib(bib);
bds.setRawBib(bib.getReference());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -824,29 +824,31 @@ public String extractAllReferencesString(List<String> texts,
if (articles != null && allReferencesNPL != null && allReferencesNPL.size()>0) {
int k = 0;
List<BiblioItem> bibResults = parsers.getCitationParser().processingStringMultiple(allReferencesNPL, consolidate);
for (String ref : allReferencesNPL) {
BiblioItem result = bibResults.get(k);
if (result == null) {
if (bibResults != null) {
for (String ref : allReferencesNPL) {
BiblioItem result = bibResults.get(k);
if (result == null || result.rejectAsReference()) {
k++;
continue;
}
BibDataSet bds = new BibDataSet();
result.setReference(ref);
bds.setResBib(result);
bds.setRawBib(ref);
bds.addOffset(allOffsetsNPL.get(k).intValue());
bds.setConfidence(allProbNPL.get(k).doubleValue());
articles.add(bds);
Comment thread
lfoppiano marked this conversation as resolved.
//allIndexSegmentNPL.add(localIndexSegmentNPL.get(k));

List<BibDataSet> localList = articlesBySegment.get(localIndexSegmentNPL.get(k));
if (localList == null) {
localList = new ArrayList<>();
}
localList.add(bds);
articlesBySegment.put(localIndexSegmentNPL.get(k), localList);

k++;
continue;
}
BibDataSet bds = new BibDataSet();
result.setReference(ref);
bds.setResBib(result);
bds.setRawBib(ref);
bds.addOffset(allOffsetsNPL.get(k).intValue());
bds.setConfidence(allProbNPL.get(k).doubleValue());
articles.add(bds);
//allIndexSegmentNPL.add(localIndexSegmentNPL.get(k));

List<BibDataSet> localList = articlesBySegment.get(localIndexSegmentNPL.get(k));
if (localList == null) {
localList = new ArrayList<>();
}
localList.add(bds);
articlesBySegment.put(localIndexSegmentNPL.get(k), localList);

k++;
}
}
} catch (Exception e) {
Expand Down Expand Up @@ -1367,20 +1369,22 @@ public String annotateAllReferences(Document doc,
if (articles != null) {
int k = 0;
List<BiblioItem> bibResults = parsers.getCitationParser().processingStringMultiple(referencesNPL, consolidate);
for (String ref : referencesNPL) {
BiblioItem result = bibResults.get(k);
if (result == null) {
if (bibResults != null) {
for (String ref : referencesNPL) {
BiblioItem result = bibResults.get(k);
if (result == null || result.rejectAsReference()) {
k++;
continue;
}
BibDataSet bds = new BibDataSet();
result.setReference(ref);
bds.setResBib(result);
bds.setRawBib(ref);
bds.addOffset(offsets_NPL.get(k).intValue());
//bds.setConfidence(probNPL.get(k).doubleValue());
articles.add(bds);
k++;
Comment thread
lfoppiano marked this conversation as resolved.
continue;
}
BibDataSet bds = new BibDataSet();
result.setReference(ref);
bds.setResBib(result);
bds.setRawBib(ref);
bds.addOffset(offsets_NPL.get(k).intValue());
//bds.setConfidence(probNPL.get(k).doubleValue());
articles.add(bds);
k++;
}
}
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.grobid.core.engines;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.GrobidConfig;
import org.grobid.core.utilities.GrobidProperties;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.easymock.EasyMock;
import org.powermock.api.easymock.PowerMock;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;
import org.powermock.reflect.Whitebox;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.junit.Assert.assertThat;

@RunWith(PowerMockRunner.class)
@PrepareForTest(Lexicon.class)
public class CitationParserNullHandlingTest {

private CitationParser target;

@Before
public void setUp() throws Exception {
PowerMock.mockStatic(Lexicon.class);
Lexicon mockLexicon = EasyMock.createNiceMock(Lexicon.class);
EasyMock.expect(Lexicon.getInstance()).andReturn(mockLexicon).anyTimes();
PowerMock.replay(Lexicon.class);
GrobidConfig.ModelParameters modelParameters = new GrobidConfig.ModelParameters();
modelParameters.name = "bao";
GrobidProperties.addModel(modelParameters);

// Initialize minimal grobidConfig so BiblioItem serialization methods work
GrobidConfig config = new GrobidConfig();
config.grobid = new GrobidConfig.GrobidParameters();
config.grobid.languageDetectorFactory = "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory";
Whitebox.setInternalState(GrobidProperties.class, "grobidConfig", config);

target = new CitationParser(null, GrobidModels.DUMMY);
Comment thread
lfoppiano marked this conversation as resolved.
}

@After
public void tearDown() throws Exception {
Whitebox.setInternalState(GrobidProperties.class, "grobidConfig", (GrobidConfig) null);
}

@Test
public void processingStringMultiple_nullInput_returnsNull() {
List<BiblioItem> result = target.processingStringMultiple(null, 0);
assertThat(result, is(nullValue()));
}

@Test
public void processingStringMultiple_emptyList_returnsNull() {
List<BiblioItem> result = target.processingStringMultiple(new ArrayList<>(), 0);
assertThat(result, is(nullValue()));
}

@Test
public void processingStringMultiple_allBlankStrings_returnsNull() {
List<String> inputs = Arrays.asList("", " ", "\t");
List<BiblioItem> result = target.processingStringMultiple(inputs, 0);
assertThat(result, is(nullValue()));
}

@Test
public void processingStringMultiple_nbspOnlyStrings_returnsNull() {
List<String> inputs = Arrays.asList("\u00A0", "\u00A0\u00A0\u00A0");
List<BiblioItem> result = target.processingStringMultiple(inputs, 0);
assertThat(result, is(nullValue()));
}

@Test
public void processingLayoutTokenMultiple_allEmptyTokenLists_returnsNull() {
List<List<LayoutToken>> tokenList = new ArrayList<>();
tokenList.add(new ArrayList<>());
tokenList.add(new ArrayList<>());
List<BiblioItem> result = target.processingLayoutTokenMultiple(tokenList, 0);
assertThat(result, is(nullValue()));
}

@Test
public void emptyBiblioItem_serializesSafely() {
BiblioItem empty = new BiblioItem();
GrobidAnalysisConfig config = GrobidAnalysisConfig.defaultInstance();

String bibtex = empty.toBibTeX("0", config);
assertThat(bibtex, is(notNullValue()));

String tei = empty.toTEI(0, config);
assertThat(tei, is(notNullValue()));
}

@Test
public void emptyBiblioItem_isRejectedAsReference() {
BiblioItem empty = new BiblioItem();
assertThat(empty.rejectAsReference(), is(true));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -326,15 +326,25 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig
} else if (expectedResponseType == ExpectedResponseType.BIBTEX) {
StringBuilder responseContent = new StringBuilder();
int n = 0;
int appended = 0;
for(BiblioItem biblioItem : biblioItems) {
if (biblioItem == null) {
n++;
continue;
}
responseContent.append(biblioItem.toBibTeX(""+n, config));
responseContent.append("\n");
Comment thread
lfoppiano marked this conversation as resolved.
n++;
appended++;
}
if (appended == 0) {
response = Response.status(Status.NO_CONTENT).build();
} else {
response = Response.status(Status.OK)
.entity(responseContent.toString())
.header(HttpHeaders.CONTENT_TYPE, BibTexMediaType.MEDIA_TYPE + "; charset=UTF-8")
.build();
}
response = Response.status(Status.OK)
.entity(responseContent.toString())
.header(HttpHeaders.CONTENT_TYPE, BibTexMediaType.MEDIA_TYPE + "; charset=UTF-8")
.build();
} else {
StringBuilder responseContent = new StringBuilder();
// add some TEI envelop
Expand All @@ -344,16 +354,26 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig
responseContent.append("\t<teiHeader/>\n\t<text>\n\t\t<front/>\n\t\t" +
"<body/>\n\t\t<back>\n\t\t\t<div>\n\t\t\t\t<listBibl>\n");
int n = 0;
int appended = 0;
for(BiblioItem biblioItem : biblioItems) {
if (biblioItem == null) {
n++;
continue;
}
responseContent.append(biblioItem.toTEI(n, config));
responseContent.append("\n");
Comment thread
lfoppiano marked this conversation as resolved.
n++;
appended++;
}
if (appended == 0) {
response = Response.status(Status.NO_CONTENT).build();
} else {
responseContent.append("\t\t\t\t</listBibl>\n\t\t\t</div>\n\t\t</back>\n\t</text>\n</TEI>\n");
response = Response.status(Status.OK)
.entity(responseContent.toString())
.header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_XML + "; charset=UTF-8")
.build();
}
responseContent.append("\t\t\t\t</listBibl>\n\t\t\t</div>\n\t\t</back>\n\t</text>\n</TEI>\n");
response = Response.status(Status.OK)
.entity(responseContent.toString())
.header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_XML + "; charset=UTF-8")
.build();
}
} catch (NoSuchElementException nseExp) {
LOGGER.error("Could not get an engine from the pool within configured time. Sending service unavailable.");
Expand Down
Loading