Untitled diff

Erstellt Diff läuft nie ab
58 Entfernungen
91 Zeilen
72 Hinzufügungen
113 Zeilen
/*
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* See the License for the specific language governing permissions and
* limitations under the License.
* limitations under the License.
*/
*/
package org.apache.any23.cli.flows;
package org.apache.any23.cli.flows;



import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.cli.ExtractorsFlowTest;
import org.apache.any23.extractor.*;
import org.apache.any23.vocab.CSV;
import org.apache.any23.vocab.CSV;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.eclipse.rdf4j.model.*;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.impl.TreeModel;
import org.eclipse.rdf4j.model.util.Models;
import org.eclipse.rdf4j.model.util.Models;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
import org.slf4j.Logger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.LoggerFactory;


import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Optional;
import java.util.Collections;
import java.util.Set;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Collectors;


/**
/**
* Proof of concept for ANY23-396 example.
* Proof of concept for ANY23-396 example.
*/
*/
public class PeopleExtractor implements Extractor.ModelExtractor {
public class PeopleExtractor extends CompositeTripleHandler {


private Logger log = LoggerFactory.getLogger(PeopleExtractor.class);
private Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());


private static final String RAW_NS = "urn:dataser:raw/";
private static final CSV csv = CSV.getInstance();
private CSV csv = CSV.getInstance();
private static final ValueFactory vf = SimpleValueFactory.getInstance();
private ValueFactory vf = SimpleValueFactory.getInstance();
public static final String RAW_NS = "urn:dataser:raw/";
private static final IRI RAW_FIRST_NAME = vf.createIRI(RAW_NS, "FirstName");
private static final IRI RAW_LAST_NAME = vf.createIRI(RAW_NS, "LastName");

private static final String NAMESPACE = "http://supercustom.net/ontology/";
private static final IRI PERSON = vf.createIRI(NAMESPACE, "Person");
private static final IRI FULL_NAME = vf.createIRI(NAMESPACE, "fullName");
private static final IRI HASH = vf.createIRI(NAMESPACE, "hash");

/**
 * Builds the RDF description of a single person.
 *
 * <p>The subject IRI is created in the {@code http://rdf.supercustom.net/data/} namespace,
 * using the SHA-1 hex digest of {@code fullName} as its local part. Three statements are
 * added for that subject: an {@code rdf:type} of {@code PERSON}, a {@code FULL_NAME}
 * literal holding {@code fullName}, and a {@code HASH} literal typed as
 * {@code xsd:hexBinary}.
 *
 * @param fullName the person's full name; it is both hashed and stored verbatim
 * @return a new {@link TreeModel} containing the three statements about the person
 */
public static Model createPerson(String fullName) {
    IRI person = vf.createIRI("http://rdf.supercustom.net/data/", DigestUtils.sha1Hex(fullName));

    Model description = new TreeModel();
    description.add(person, RDF.TYPE, PERSON);
    description.add(person, FULL_NAME, vf.createLiteral(fullName));
    // The hash value is read back from the subject IRI's local name instead of
    // hashing the name a second time.
    description.add(person, HASH, vf.createLiteral(person.getLocalName(), XMLSchema.HEXBINARY));
    return description;
}

private final Model csvModel = new TreeModel();

/**
 * Creates a handler that wraps exactly one downstream handler.
 *
 * @param delegate the handler handed to the {@code CompositeTripleHandler} superclass
 *                 as the sole element of its handler list
 */
public PeopleExtractor(TripleHandler delegate) {
    super(
        Collections.singletonList(delegate)
    );
}


@Override
@Override
public void run(ExtractionParameters extractionParameters, ExtractionContext context, Model in, ExtractionResult out) throws IOException, ExtractionException {
public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException {
if (in.isEmpty()) {
if ("csv".equals(context.getExtractorName())) {
throw new ExtractionException("model is empty ");
csvModel.add(s, p, o, vf.createIRI(context.getUniqueID()));
} else {
super.receiveTriple(s, p, o, g, context);
}
}

}


// for each row
@Override
Set<Resource> subjects = in.filter(null, RDF.TYPE, csv.rowType)
public void closeContext(ExtractionContext context) throws TripleHandlerException {
.stream()
Set<Resource> subjects = csvModel.filter(null, RDF.TYPE, csv.rowType)
.map( s -> {return s.getSubject(); }) // get subjects from each triple
.stream().map(Statement::getSubject).collect(Collectors.toSet());
.collect(Collectors.toSet());


log.debug("List of rows: {}", subjects);
log.debug("List of rows: {}", subjects);


subjects.stream()
for (Resource rowId : subjects) {
.forEach( rowId -> {
String firstName = Models.objectLiteral(csvModel.filter(rowId, RAW_FIRST_NAME, null))
String firstName = "";
.map(Literal::getLabel).orElse("");
Optional<Literal> firstNameO = Models.objectLiteral(in.filter(rowId, vf.createIRI(RAW_NS, "FirstName"), null));
if (firstNameO.isPresent()) {
firstName = firstNameO.get().stringValue();
}


String lastName = "";
String lastName = Models.objectLiteral(csvModel.filter(rowId, RAW_LAST_NAME, null))
Optional<Literal> lastNameO = Models.objectLiteral(in.filter(rowId, vf.createIRI(RAW_NS, "LastName"), null));
.map(Literal::getLabel).orElse("");
if (lastNameO.isPresent()) {
lastName = lastNameO.get().stringValue();
}


String fullName = firstName + " " + lastName;
String fullName = firstName + " " + lastName;


IRI personID = ExtractorsFlowTest.personIRIFactory.apply(fullName);
for (Statement s : createPerson(fullName)) {
out.writeTriple(personID, RDF.TYPE, ExtractorsFlowTest.PERSON);
super.receiveTriple(s.getSubject(), s.getPredicate(), s.getObject(), null, context);
out.writeTriple(personID, ExtractorsFlowTest.FULL_NAME, vf.createLiteral(fullName));
}
out.writeTriple(personID, ExtractorsFlowTest.HASH, vf.createLiteral(DigestUtils.sha1Hex(fullName), XMLSchema.HEXBINARY));
}
});

csvModel.clear();


super.closeContext(context);
}
}


@Override
public ExtractorDescription getDescription() {
return PeopleExtractorFactory.getDescriptionInstance();
}
}
}