public class TermSuitePipeline
extends java.lang.Object
Modifier and Type | Method and Description |
---|---|
TermSuitePipeline |
aeChineseTokenizer()
Tokenizer for chinese collections.
|
TermSuitePipeline |
aeCompostSplitter() |
TermSuitePipeline |
aeCompoundSplitter()
Deprecated.
Use
aeCompostSplitter() instead |
TermSuitePipeline |
aeContextualizer(int scope,
boolean allTerms)
Computes the
Contextualizer vector of all
single-word terms in the term index. |
TermSuitePipeline |
aeExtensionDetector()
Detects all inclusion/extension relations between terms that have size >= 2.
|
TermSuitePipeline |
aeGraphicalVariantGatherer() |
TermSuitePipeline |
aeMateTaggerLemmatizer() |
TermSuitePipeline |
aeMaxSizeThresholdCleaner(TermProperty property,
int maxSize) |
TermSuitePipeline |
aeNeoClassicalSplitter()
Deprecated.
Use
aeCompostSplitter() instead |
TermSuitePipeline |
aePrefixSplitter()
Deprecated.
Use
aeCompostSplitter() instead |
TermSuitePipeline |
aePrimaryOccurrenceDetector(int detectionStrategy) |
TermSuitePipeline |
aeRegexSpotter()
The single-word and multi-word term spotter AE
based on UIMA Tokens Regex.
|
TermSuitePipeline |
aeSpecificityComputer()
Computes
TermProperty.WR values (and additional
term properties of type TermProperty in the future). |
TermSuitePipeline |
aeStemmer() |
TermSuitePipeline |
aeStopWordsFilter()
Removes from the term index any term having a
stop word at its boundaries.
|
TermSuitePipeline |
aeSyntacticVariantGatherer()
Gathers terms according to their syntactic structures.
|
TermSuitePipeline |
aeTermClassifier(TermProperty sortingProperty) |
TermSuitePipeline |
aeThresholdCleaner(TermProperty property,
float threshold) |
TermSuitePipeline |
aeThresholdCleaner(TermProperty property,
float threshold,
boolean isPeriodic,
int cleaningPeriod,
int termIndexSizeTrigger) |
TermSuitePipeline |
aeThresholdCleanerPeriodic(TermProperty property,
float threshold,
int cleaningPeriod) |
TermSuitePipeline |
aeThresholdCleanerSizeTrigger(TermProperty property,
float threshold,
int termIndexSizeTrigger) |
TermSuitePipeline |
aeTopNCleaner(TermProperty property,
int n) |
TermSuitePipeline |
aeTopNCleanerPeriodic(TermProperty property,
int n,
boolean isPeriodic,
int cleaningPeriod) |
TermSuitePipeline |
aeTreeTagger() |
TermSuitePipeline |
aeUrlFilter()
Filters out URLs from CAS.
|
TermSuitePipeline |
aeWordTokenizer() |
static TermSuitePipeline |
create(java.lang.String lang) |
static TermSuitePipeline |
create(TermIndex termIndex) |
org.apache.uima.analysis_engine.AnalysisEngineDescription |
createDescription() |
TermSuitePipeline |
emptyCollection() |
TermSuitePipeline |
emptyTermIndex(java.lang.String name)
Creates a new in-memory
TermIndex on which this
pipeline will run. |
TermSuitePipeline |
enableSyntacticLabels() |
TermIndex |
getTermIndex()
Returns the term index produced (or last modified) by this pipeline.
|
TermSuitePipeline |
haeCasStatCounter(java.lang.String statName) |
TermSuitePipeline |
haeCompoundExporter(java.lang.String toFilePath)
Exports all compound words of the terminology to given file path.
|
TermSuitePipeline |
haeEval(java.lang.String refFileURI,
java.lang.String outputFile,
java.lang.String customLogHeader,
java.lang.String rFile,
java.lang.String evalTraceName,
boolean rtlWithVariants) |
TermSuitePipeline |
haeEvalExporter(java.lang.String toFilePath,
boolean withVariants) |
TermSuitePipeline |
haeExportVariationRuleExamples(java.lang.String toFilePath)
Exports examples of matching pairs for each variation rule.
|
TermSuitePipeline |
haeJsonExporter(java.lang.String toFilePath) |
TermSuitePipeline |
haeLogOverlappingRules() |
TermSuitePipeline |
haeScorifier()
Computes variant scores recursively.
|
TermSuitePipeline |
haeSpotterTSVWriter(java.lang.String toDirectoryPath)
Exports all CAS in TSV format to a given directory.
|
TermSuitePipeline |
haeTbxExporter(java.lang.String toFilePath) |
TermSuitePipeline |
haeTraceTimePerf(java.lang.String toFile)
Exports time progress to TSV file.
|
TermSuitePipeline |
haeTsvExporter(java.lang.String toFilePath)
Exports the
TermIndex in tsv format |
TermSuitePipeline |
haeVariantEvalExporter(java.lang.String toFilePath,
int topN,
int maxVariantsPerTerm)
Creates a tsv output with:
- the occurrence list of each term and their in-text contexts
|
TermSuitePipeline |
haeXmiCasExporter(java.lang.String toDirectoryPath)
Exports all CAS as XMI files to a given directory.
|
org.apache.uima.resource.ExternalResourceDescription |
resScoredModel() |
org.apache.uima.resource.ExternalResourceDescription |
resTermIndex() |
TermSuitePipeline |
run()
Runs the pipeline with
SimplePipeline on the CollectionReader that must have been defined. |
TermSuitePipeline |
run(org.apache.uima.jcas.JCas cas)
Runs the pipeline with
SimplePipeline without requiring a CollectionReader
to be defined. |
TermSuitePipeline |
setCollection(TermSuiteCollection termSuiteCollection,
java.lang.String collectionPath,
java.lang.String collectionEncoding)
Creates a collection reader for this pipeline.
|
TermSuitePipeline |
setCollection(TermSuiteCollection termSuiteCollection,
java.lang.String collectionPath,
java.lang.String collectionEncoding,
java.lang.String droppedTags,
java.lang.String txtTags)
Creates a collection reader of type
GenericXMLToTxtCollectionReader for this pipeline. |
TermSuitePipeline |
setCompostCoeffs(float alpha,
float beta,
float gamma,
float delta) |
TermSuitePipeline |
setCompostMaxComponentNum(int compostMaxComponentNum) |
TermSuitePipeline |
setCompostMinComponentSize(int compostMinComponentSize) |
TermSuitePipeline |
setCompostScoreThreshold(float compostScoreThreshold) |
TermSuitePipeline |
setCompostSegmentSimilarityThreshold(java.lang.Object compostSegmentSimilarityThreshold) |
TermSuitePipeline |
setContextAssocRateMeasure(java.lang.String contextAssocRateMeasure) |
TermSuitePipeline |
setContextualizeCoTermsType(OccurrenceType contextualizeCoTermsType) |
TermSuitePipeline |
setContextualizeWithCoOccurrenceFrequencyThreshhold(int contextualizeWithCoOccurrenceFrequencyThreshhold) |
TermSuitePipeline |
setContextualizeWithTermClasses(boolean contextualizeWithTermClasses) |
TermSuitePipeline |
setExportFilteringRule(java.lang.String exportFilteringRule) |
TermSuitePipeline |
setExportFilteringThreshold(float exportFilteringThreshold) |
TermSuitePipeline |
setExportJsonWithContext(boolean b) |
TermSuitePipeline |
setExportJsonWithOccurrences(boolean exportJsonWithOccurrences) |
TermSuitePipeline |
setGraphicalVariantSimilarityThreshold(float th) |
TermSuitePipeline |
setInlineString(java.lang.String text) |
TermSuitePipeline |
setKeepVariantsWhileCleaning(boolean keepVariantsWhileCleaning) |
TermSuitePipeline |
setMateModelPath(java.lang.String path) |
TermSuitePipeline |
setPostProcessingStrategy(java.lang.String postProcessingStrategy)
Sets the post processing strategy for
RegexSpotter analysis engine |
TermSuitePipeline |
setResourcePath(java.lang.String resourcePath) |
TermSuitePipeline |
setSpotWithOccurrences(boolean b) |
TermSuitePipeline |
setSyntacticRegexesFilePath(java.lang.String syntacticRegexesFilePath) |
TermSuitePipeline |
setTermIndex(TermIndex termIndex)
Sets the term index on which this pipeline will run.
|
TermSuitePipeline |
setTreeTaggerHome(java.lang.String treeTaggerPath) |
TermSuitePipeline |
setTsvExportProperties(TermProperty... properties)
Defines the term properties that appear in the tsv export file
|
TermSuitePipeline |
setTsvShowHeaders(boolean tsvWithHeaders)
Configures tsvExporter to (not) show headers on the
first line.
|
TermSuitePipeline |
setTsvShowScores(boolean tsvWithVariantScores)
Configures tsvExporter to (not) show variant scores with the
"V" label
|
TermSuitePipeline |
setYamlVariantRulesFilePath(java.lang.String yamlVariantRulesFilePath) |
public static TermSuitePipeline create(java.lang.String lang)
public static TermSuitePipeline create(TermIndex termIndex)
public TermSuitePipeline run()
SimplePipeline
on the CollectionReader
that must have been defined.
TermSuitePipelineException
- if no CollectionReader
has been declared on this pipeline
public TermSuitePipeline run(org.apache.uima.jcas.JCas cas)
SimplePipeline
without requiring a CollectionReader
to be defined.cas
- the JCas
on which the pipeline operates.public TermSuitePipeline setInlineString(java.lang.String text)
public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, java.lang.String collectionPath, java.lang.String collectionEncoding)
termSuiteCollection
- collectionPath
- collectionEncoding
- public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, java.lang.String collectionPath, java.lang.String collectionEncoding, java.lang.String droppedTags, java.lang.String txtTags)
GenericXMLToTxtCollectionReader
for this pipeline.
Requires a list of dropped tags and txt tags for collection parsing.termSuiteCollection
- collectionPath
- collectionEncoding
- droppedTags
- txtTags
- AbstractToTxtSaxHandler
public TermSuitePipeline setResourcePath(java.lang.String resourcePath)
public TermSuitePipeline setContextAssocRateMeasure(java.lang.String contextAssocRateMeasure)
public TermSuitePipeline emptyCollection()
public org.apache.uima.analysis_engine.AnalysisEngineDescription createDescription()
public TermSuitePipeline aeWordTokenizer()
public TermSuitePipeline aeTreeTagger()
public TermSuitePipeline setMateModelPath(java.lang.String path)
public TermSuitePipeline aeMateTaggerLemmatizer()
public TermSuitePipeline setTsvExportProperties(TermProperty... properties)
properties
- haeTsvExporter(String)
public TermSuitePipeline haeTsvExporter(java.lang.String toFilePath)
TermIndex
in tsv formattoFilePath
- setTsvExportProperties(TermProperty...)
public TermSuitePipeline haeScorifier()
public TermSuitePipeline haeExportVariationRuleExamples(java.lang.String toFilePath)
toFilePath
- the file path where to write the examples for each variation rulespublic TermSuitePipeline haeCompoundExporter(java.lang.String toFilePath)
toFilePath
- public TermSuitePipeline haeTbxExporter(java.lang.String toFilePath)
public TermSuitePipeline haeEvalExporter(java.lang.String toFilePath, boolean withVariants)
public TermSuitePipeline setExportJsonWithOccurrences(boolean exportJsonWithOccurrences)
public TermSuitePipeline setExportJsonWithContext(boolean b)
public TermSuitePipeline haeJsonExporter(java.lang.String toFilePath)
public TermSuitePipeline haeVariantEvalExporter(java.lang.String toFilePath, int topN, int maxVariantsPerTerm)
toFilePath
- The output file path
topN
- The number of variants to keep in the file
maxVariantsPerTerm
- The maximum number of variants to evaluate for each term
public TermSuitePipeline aeStemmer()
public TermSuitePipeline aeRegexSpotter()
public TermSuitePipeline aeCompoundSplitter()
aeCompostSplitter()
insteadpublic TermSuitePipeline aeNeoClassicalSplitter()
aeCompostSplitter()
insteadpublic TermSuitePipeline aePrefixSplitter()
aeCompostSplitter()
insteadpublic TermSuitePipeline aeStopWordsFilter()
TermIndexBlacklistWordFilterAE
public TermSuitePipeline haeXmiCasExporter(java.lang.String toDirectoryPath)
toDirectoryPath
- public TermSuitePipeline haeSpotterTSVWriter(java.lang.String toDirectoryPath)
toDirectoryPath
- SpotterTSVWriter
public TermSuitePipeline aeChineseTokenizer()
ChineseSegmenter
public org.apache.uima.resource.ExternalResourceDescription resScoredModel()
public org.apache.uima.resource.ExternalResourceDescription resTermIndex()
public TermIndex getTermIndex()
public TermSuitePipeline setTermIndex(TermIndex termIndex)
termIndex
- public TermSuitePipeline emptyTermIndex(java.lang.String name)
TermIndex
on which this
pipeline will run.
name
- the name of the new term indexTermSuitePipeline
objectpublic TermSuitePipeline aeSpecificityComputer()
TermProperty.WR
values (and additional
term properties of type TermProperty
in the future).TermSpecificityComputer
,
TermProperty
public TermSuitePipeline setContextualizeCoTermsType(OccurrenceType contextualizeCoTermsType)
public TermSuitePipeline setContextualizeWithTermClasses(boolean contextualizeWithTermClasses)
public TermSuitePipeline setContextualizeWithCoOccurrenceFrequencyThreshhold(int contextualizeWithCoOccurrenceFrequencyThreshhold)
public TermSuitePipeline aeContextualizer(int scope, boolean allTerms)
Contextualizer
vector of all
single-word terms in the term index.scope
- allTerms
- Contextualizer
public TermSuitePipeline aeMaxSizeThresholdCleaner(TermProperty property, int maxSize)
public TermSuitePipeline aeThresholdCleaner(TermProperty property, float threshold, boolean isPeriodic, int cleaningPeriod, int termIndexSizeTrigger)
public TermSuitePipeline aePrimaryOccurrenceDetector(int detectionStrategy)
public TermSuitePipeline aeThresholdCleanerPeriodic(TermProperty property, float threshold, int cleaningPeriod)
property
- threshold
- cleaningPeriod
- public TermSuitePipeline aeThresholdCleanerSizeTrigger(TermProperty property, float threshold, int termIndexSizeTrigger)
public TermSuitePipeline setKeepVariantsWhileCleaning(boolean keepVariantsWhileCleaning)
public TermSuitePipeline aeThresholdCleaner(TermProperty property, float threshold)
public TermSuitePipeline aeTopNCleaner(TermProperty property, int n)
public TermSuitePipeline aeTopNCleanerPeriodic(TermProperty property, int n, boolean isPeriodic, int cleaningPeriod)
property
- n
- isPeriodic
- cleaningPeriod
- public TermSuitePipeline setGraphicalVariantSimilarityThreshold(float th)
public TermSuitePipeline aeGraphicalVariantGatherer()
public TermSuitePipeline aeUrlFilter()
public TermSuitePipeline aeSyntacticVariantGatherer()
public TermSuitePipeline aeExtensionDetector()
public TermSuitePipeline setExportFilteringRule(java.lang.String exportFilteringRule)
public TermSuitePipeline setExportFilteringThreshold(float exportFilteringThreshold)
public TermSuitePipeline setTreeTaggerHome(java.lang.String treeTaggerPath)
public TermSuitePipeline setSyntacticRegexesFilePath(java.lang.String syntacticRegexesFilePath)
public TermSuitePipeline haeLogOverlappingRules()
public TermSuitePipeline enableSyntacticLabels()
public TermSuitePipeline setYamlVariantRulesFilePath(java.lang.String yamlVariantRulesFilePath)
public TermSuitePipeline setCompostCoeffs(float alpha, float beta, float gamma, float delta)
public TermSuitePipeline setCompostMaxComponentNum(int compostMaxComponentNum)
public TermSuitePipeline setCompostMinComponentSize(int compostMinComponentSize)
public TermSuitePipeline setCompostScoreThreshold(float compostScoreThreshold)
public TermSuitePipeline setCompostSegmentSimilarityThreshold(java.lang.Object compostSegmentSimilarityThreshold)
public TermSuitePipeline aeCompostSplitter()
public TermSuitePipeline haeCasStatCounter(java.lang.String statName)
public TermSuitePipeline haeTraceTimePerf(java.lang.String toFile)
WordAnnotation
processedtoFile
- public TermSuitePipeline aeTermClassifier(TermProperty sortingProperty)
sortingProperty
- the term property used to order terms before they are classified.
The first term of a class appearing given this order will be considered
as the head of the class.TermClassifier
public TermSuitePipeline haeEval(java.lang.String refFileURI, java.lang.String outputFile, java.lang.String customLogHeader, java.lang.String rFile, java.lang.String evalTraceName, boolean rtlWithVariants)
refFileURI
- The path to the reference terminology
outputFile
- The path to the output log file
customLogHeader
- A custom string to add in the header of the output log file
rFile
- The path to the output R file
evalTraceName
- The name of the eval trace
rtlWithVariants
- true if variants of the reference terminology should be kept during the eval
public TermSuitePipeline setSpotWithOccurrences(boolean b)
public TermSuitePipeline setPostProcessingStrategy(java.lang.String postProcessingStrategy)
RegexSpotter
analysis enginepostProcessingStrategy
- aeRegexSpotter()
,
OccurrenceBuffer.NO_CLEANING
,
OccurrenceBuffer.KEEP_PREFIXES
,
OccurrenceBuffer.KEEP_SUFFIXES
public TermSuitePipeline setTsvShowHeaders(boolean tsvWithHeaders)
tsvWithHeaders
- the flagpublic TermSuitePipeline setTsvShowScores(boolean tsvWithVariantScores)
tsvWithVariantScores
- the flag