protected static class WorkerThread.ProcessActivity extends Object implements IProcessActivity
_rcsid
BAD_URL, EXCLUDED_CONTENT, EXCLUDED_DATE, EXCLUDED_LENGTH, EXCLUDED_MIMETYPE, EXCLUDED_URL, NULL_URL
| Constructor and Description |
|---|
WorkerThread.ProcessActivity(Long jobID,
String processID,
IReprioritizationTracker rt,
IJobManager jobManager,
IIncrementalIngester ingester,
String connectionName,
IPipelineSpecification pipelineSpecification,
Map<String,QueuedDocument> previousDocuments,
long currentTime,
Long expireInterval,
Map<String,Set<String>> forcedMetadata,
Long recrawlInterval,
Long maxInterval,
int hopcountMode,
IRepositoryConnection connection,
IRepositoryConnector connector,
IRepositoryConnectionManager connMgr,
String[] legalLinkTypes,
WorkerThread.OutputActivity ingestLogger,
String parameterVersion)
Constructor.
|
| Modifier and Type | Method and Description |
|---|---|
void |
addDocumentReference(String localIdentifier)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues,
Long originationTime)
Add a document description to the current job's queue.
|
void |
addDocumentReference(String localIdentifier,
String parentIdentifier,
String relationshipType,
String[] dataNames,
Object[][] dataValues,
Long originationTime,
String[] prereqEventNames)
Add a document description to the current job's queue.
|
boolean |
beginEventSequence(String eventName)
Begin an event sequence.
|
Long |
calculateDocumentExpireTime(long currentTime,
String localIdentifier) |
Long |
calculateDocumentRescheduleTime(long currentTime,
long timeAmt,
String localIdentifier) |
protected void |
checkAllComponentsMultipleDispositions(String documentIdentifier) |
boolean |
checkDateIndexable(Date date)
Detect if a date is indexable or not.
|
boolean |
checkDocumentIndexable(File localFile)
Check whether a document is indexable by the currently specified output connector.
|
boolean |
checkDocumentNeedsReindexing(String documentIdentifier,
String newVersionString)
Check if a document needs to be reindexed, based on a computed version string.
|
boolean |
checkDocumentNeedsReindexing(String documentIdentifier,
String componentIdentifier,
String newVersionString)
Check if a document needs to be reindexed, based on a computed version string.
|
void |
checkJobStillActive()
Check whether current job is still active.
|
boolean |
checkLengthIndexable(long length)
Check whether a document of a specified length is indexable by the currently specified output connector.
|
boolean |
checkMimeTypeIndexable(String mimeType)
Check whether a mime type is indexable by the currently specified output connector.
|
protected void |
checkMultipleDispositions(String documentIdentifier,
String componentIdentifier,
String componentIdentifierHash) |
boolean |
checkURLIndexable(String url)
Pre-determine whether a document's URL is indexable by this connector.
|
void |
completeEventSequence(String eventName)
Complete an event sequence.
|
protected IPipelineSpecificationWithVersions |
computePipelineSpecificationWithVersions(String documentIdentifierHash,
String componentIdentifierHash,
String documentIdentifier) |
String |
createConnectionSpecificString(String simpleString)
Create a connection-specific string from a simple string.
|
String |
createGlobalString(String simpleString)
Create a global string from a simple string.
|
String |
createJobSpecificString(String simpleString)
Create a job-based string from a simple string.
|
void |
deleteDocument(String documentIdentifier)
Delete the specified document from the search engine index, and from the status table.
|
void |
deleteDocument(String documentIdentifier,
String version)
Deprecated.
|
void |
discard()
Clean up any dangling information, before abandoning this process activity object
|
void |
flush()
Flush the outstanding references into the database.
|
Long |
getDocumentExpirationLowerBoundTime(String localIdentifier)
Find a document's lower expiration time bound, if any
|
Long |
getDocumentExpirationUpperBoundTime(String localIdentifier)
Find a document's upper expiration time bound, if any
|
Long |
getDocumentOriginationTime(String localIdentifier)
Get a document's origination time
|
Long |
getDocumentRescheduleLowerBoundTime(String localIdentifier)
Find a document's lower rescheduling time bound, if any
|
Long |
getDocumentRescheduleUpperBoundTime(String localIdentifier)
Find a document's upper rescheduling time bound, if any
|
void |
ingestDocument(String localIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Deprecated.
|
void |
ingestDocumentWithException(String documentIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Ingest the current document.
|
void |
ingestDocumentWithException(String documentIdentifier,
String componentIdentifier,
String version,
String documentURI,
RepositoryDocument data)
Ingest the current document.
|
void |
noDocument(String documentIdentifier,
String version)
Remove the specified document from the search engine index, while keeping track of the version information
for it (to reduce churn).
|
void |
noDocument(String documentIdentifier,
String componentIdentifier,
String version)
Remove the specified document from the search engine index, and update the
recorded version information for the document.
|
protected void |
processDocumentReferences()
Process outstanding document references, in batch.
|
void |
recordActivity(Long startTime,
String activityType,
Long dataSize,
String entityIdentifier,
String resultCode,
String resultDescription,
String[] childIdentifiers)
Record time-stamped information about the activity of the connector.
|
void |
recordDocument(String documentIdentifier,
String version)
Record a document version, but don't ingest it.
|
void |
recordDocument(String documentIdentifier,
String componentIdentifier,
String version)
Record a document version, WITHOUT reindexing it, or removing it.
|
void |
removeDocument(String documentIdentifier)
Remove the specified document primary component permanently from the search engine index,
and from the status table.
|
void |
resetTimes()
Reset the recorded times
|
void |
retainAllComponentDocument(String documentIdentifier)
Retain all existing document components of a primary document.
|
void |
retainDocument(String documentIdentifier,
String componentIdentifier)
Retain existing document component.
|
String[] |
retrieveParentData(String localIdentifier,
String dataName)
Retrieve data passed from parents to a specified child document.
|
CharacterInput[] |
retrieveParentDataAsFiles(String localIdentifier,
String dataName)
Retrieve data passed from parents to a specified child document.
|
void |
retryDocumentProcessing(String localIdentifier)
Abort processing a document (for sequencing reasons).
|
void |
setDocumentOriginationTime(String localIdentifier,
Long originationTime)
Override a document's origination time.
|
void |
setDocumentScheduleBounds(String localIdentifier,
Long lowerRecrawlBoundTime,
Long upperRecrawlBoundTime,
Long lowerExpireBoundTime,
Long upperExpireBoundTime)
Override the schedule for the next time a document is crawled.
|
protected void |
touchAllComponentsSet(String documentIdentifier) |
protected void |
touchComponentSet(String documentIdentifier,
String componentIdentifierHash) |
boolean |
wasDocumentAborted(String documentIdentifier)
Check whether a document was aborted or not.
|
boolean |
wasDocumentComponentTouched(String documentIdentifier,
String componentIdentifierHash)
Check whether a document component was touched or not.
|
boolean |
wasDocumentDeleted(String documentIdentifier)
Check whether document was deleted or not.
|
boolean |
wasDocumentTouched(String documentIdentifier)
Check whether a document (and its version string) was touched or not.
|
protected final Long jobID
protected final String processID
protected final IJobManager jobManager
protected final IIncrementalIngester ingester
protected final String connectionName
protected final IPipelineSpecification pipelineSpecification
protected final Map<String,QueuedDocument> previousDocuments
protected final long currentTime
protected final Long expireInterval
protected final Long recrawlInterval
protected final Long maxInterval
protected final int hopcountMode
protected final IRepositoryConnection connection
protected final IRepositoryConnector connector
protected final IRepositoryConnectionManager connMgr
protected final String[] legalLinkTypes
protected final WorkerThread.OutputActivity ingestLogger
protected final IReprioritizationTracker rt
protected final String parameterVersion
protected final Map<WorkerThread.DocumentReference,WorkerThread.DocumentReference> referenceList
public WorkerThread.ProcessActivity(Long jobID, String processID, IReprioritizationTracker rt, IJobManager jobManager, IIncrementalIngester ingester, String connectionName, IPipelineSpecification pipelineSpecification, Map<String,QueuedDocument> previousDocuments, long currentTime, Long expireInterval, Map<String,Set<String>> forcedMetadata, Long recrawlInterval, Long maxInterval, int hopcountMode, IRepositoryConnection connection, IRepositoryConnector connector, IRepositoryConnectionManager connMgr, String[] legalLinkTypes, WorkerThread.OutputActivity ingestLogger, String parameterVersion)
Parameters:
jobManager - is the job manager
ingester - is the ingester
public void discard() throws ManifoldCFException
Throws:
ManifoldCFException
public boolean wasDocumentTouched(String documentIdentifier)
public boolean wasDocumentComponentTouched(String documentIdentifier, String componentIdentifierHash)
public boolean wasDocumentDeleted(String documentIdentifier)
public boolean wasDocumentAborted(String documentIdentifier)
public boolean checkDocumentNeedsReindexing(String documentIdentifier, String newVersionString) throws ManifoldCFException
Specified by: checkDocumentNeedsReindexing in interface IProcessActivity
Parameters:
documentIdentifier - is the document identifier.
newVersionString - is the newly-computed version string.
Throws:
ManifoldCFException
public boolean checkDocumentNeedsReindexing(String documentIdentifier, String componentIdentifier, String newVersionString) throws ManifoldCFException
Specified by: checkDocumentNeedsReindexing in interface IProcessActivity
Parameters:
documentIdentifier - is the document identifier.
componentIdentifier - is the component document identifier, if any.
newVersionString - is the newly-computed version string.
Throws:
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames) throws ManifoldCFException
Specified by: addDocumentReference in interface IProcessActivity
Parameters:
localIdentifier - is the local document identifier to add (for the connector that fetched the document).
parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null, if no hopcount filtering is desired for this kind of relationship.
relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null. The type of each object must be either a String or a CharacterInput.
originationTime - is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
prereqEventNames - are the names of the prerequisite events which this document requires prior to processing. Pass null if none.
Throws:
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues, Long originationTime) throws ManifoldCFException
Specified by: addDocumentReference in interface IProcessActivity
Parameters:
localIdentifier - is the local document identifier to add (for the connector that fetched the document).
parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null, if no hopcount filtering is desired for this kind of relationship.
relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
originationTime - is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
Throws:
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType, String[] dataNames, Object[][] dataValues) throws ManifoldCFException
Specified by: addDocumentReference in interface IProcessActivity
Parameters:
localIdentifier - is the local document identifier to add (for the connector that fetched the document).
parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null, if no hopcount filtering is desired for this kind of relationship.
relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
Throws:
ManifoldCFException
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType) throws ManifoldCFException
Specified by: addDocumentReference in interface IProcessActivity
Parameters:
localIdentifier - is the local document identifier to add (for the connector that fetched the document).
parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null, if no hopcount filtering is desired for this kind of relationship.
relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
Throws:
ManifoldCFException
public void addDocumentReference(String localIdentifier) throws ManifoldCFException
Specified by: addDocumentReference in interface IProcessActivity
Parameters:
localIdentifier - is the local document identifier to add (for the connector that fetched the document).
Throws:
ManifoldCFException
public String[] retrieveParentData(String localIdentifier, String dataName) throws ManifoldCFException
Specified by: retrieveParentData in interface ICarrydownActivity
Parameters:
localIdentifier - is the document identifier of the document we want the recorded data for.
dataName - is the name of the data items to retrieve.
Throws:
ManifoldCFException
public CharacterInput[] retrieveParentDataAsFiles(String localIdentifier, String dataName) throws ManifoldCFException
Specified by: retrieveParentDataAsFiles in interface ICarrydownActivity
Parameters:
localIdentifier - is the document identifier of the document we want the recorded data for.
dataName - is the name of the data items to retrieve.
Throws:
ManifoldCFException
public void recordDocument(String documentIdentifier, String version) throws ManifoldCFException
Specified by: recordDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document identifier.
version - is the document version.
Throws:
ManifoldCFException
public void recordDocument(String documentIdentifier, String componentIdentifier, String version) throws ManifoldCFException
Specified by: recordDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document identifier.
componentIdentifier - is the component document identifier, if any.
version - is the document version.
Throws:
ManifoldCFException
@Deprecated
public void ingestDocument(String localIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption
Specified by: ingestDocument in interface IProcessActivity
Parameters:
localIdentifier - is the document's local identifier.
version - is the version of the document, as reported by the getDocumentVersions() method of the corresponding repository connector.
documentURI - is the URI to use to retrieve this document from the search interface (and is also the unique key in the index).
data - is the document data. The data is closed after ingestion is complete. NOTE: Any data stream IOExceptions will be converted to ManifoldCFExceptions and ServiceInterruptions according to standard best practices.
Throws:
ManifoldCFException
ServiceInterruption
public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption, IOException
Specified by: ingestDocumentWithException in interface IProcessActivity
Parameters:
documentIdentifier - is the document's local identifier.
version - is the version of the document, as reported by the getDocumentVersions() method of the corresponding repository connector.
documentURI - is the URI to use to retrieve this document from the search interface (and is also the unique key in the index).
data - is the document data. The data is closed after ingestion is complete.
Throws:
IOException - only when data stream reading fails.
ManifoldCFException
ServiceInterruption
public void ingestDocumentWithException(String documentIdentifier, String componentIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption, IOException
Specified by: ingestDocumentWithException in interface IProcessActivity
Parameters:
documentIdentifier - is the document's identifier.
componentIdentifier - is the component document identifier, if any.
version - is the version of the document, as reported by the getDocumentVersions() method of the corresponding repository connector.
documentURI - is the URI to use to retrieve this document from the search interface (and is also the unique key in the index).
data - is the document data. The data is closed after ingestion is complete.
Throws:
IOException - only when data stream reading fails.
ManifoldCFException
ServiceInterruption
public void noDocument(String documentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
Specified by: noDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's local identifier.
version - is the version string to be recorded for the document.
Throws:
ManifoldCFException
ServiceInterruption
public void noDocument(String documentIdentifier, String componentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
Specified by: noDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's local identifier.
componentIdentifier - is the component document identifier, if any.
version - is the version string to be recorded for the document.
Throws:
ManifoldCFException
ServiceInterruption
public void removeDocument(String documentIdentifier) throws ManifoldCFException, ServiceInterruption
Specified by: removeDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's identifier.
Throws:
ManifoldCFException
ServiceInterruption
public void retainDocument(String documentIdentifier, String componentIdentifier) throws ManifoldCFException
Specified by: retainDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's identifier.
componentIdentifier - is the component document identifier, which cannot be null.
Throws:
ManifoldCFException
@Deprecated
public void deleteDocument(String documentIdentifier, String version) throws ManifoldCFException, ServiceInterruption
Specified by: deleteDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's local identifier.
version - is the version string to be recorded for the document.
Throws:
ManifoldCFException
ServiceInterruption
public void retainAllComponentDocument(String documentIdentifier) throws ManifoldCFException
Specified by: retainAllComponentDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's identifier.
Throws:
ManifoldCFException
public void deleteDocument(String documentIdentifier) throws ManifoldCFException
Specified by: deleteDocument in interface IProcessActivity
Parameters:
documentIdentifier - is the document's identifier.
Throws:
ManifoldCFException
public void setDocumentScheduleBounds(String localIdentifier, Long lowerRecrawlBoundTime, Long upperRecrawlBoundTime, Long lowerExpireBoundTime, Long upperExpireBoundTime) throws ManifoldCFException
Specified by: setDocumentScheduleBounds in interface IProcessActivity
Parameters:
localIdentifier - is the document's local identifier.
lowerRecrawlBoundTime - is the time in ms since epoch that the reschedule time should not fall BELOW, or null if none.
upperRecrawlBoundTime - is the time in ms since epoch that the reschedule time should not rise ABOVE, or null if none.
lowerExpireBoundTime - is the time in ms since epoch that the expire time should not fall BELOW, or null if none.
upperExpireBoundTime - is the time in ms since epoch that the expire time should not rise ABOVE, or null if none.
Throws:
ManifoldCFException
public void setDocumentOriginationTime(String localIdentifier, Long originationTime) throws ManifoldCFException
Specified by: setDocumentOriginationTime in interface IProcessActivity
Parameters:
localIdentifier - is the document's local identifier.
originationTime - is the document's origination time, or null if unknown.
Throws:
ManifoldCFException
public Long getDocumentRescheduleLowerBoundTime(String localIdentifier)
public Long getDocumentRescheduleUpperBoundTime(String localIdentifier)
public Long getDocumentExpirationLowerBoundTime(String localIdentifier)
public Long getDocumentExpirationUpperBoundTime(String localIdentifier)
public Long getDocumentOriginationTime(String localIdentifier)
public Long calculateDocumentRescheduleTime(long currentTime, long timeAmt, String localIdentifier)
public Long calculateDocumentExpireTime(long currentTime, String localIdentifier)
public void resetTimes()
public void recordActivity(Long startTime, String activityType, Long dataSize, String entityIdentifier, String resultCode, String resultDescription, String[] childIdentifiers) throws ManifoldCFException
Specified by: recordActivity in interface IHistoryActivity
Parameters:
startTime - is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every activity has an associated time; the startTime field records when the activity began. A null value indicates that the start time and the finishing time are the same.
activityType - is a string which is fully interpretable only in the context of the connector involved, which is used to categorize what kind of activity is being recorded. For example, a web connector might record a "fetch document" activity. Cannot be null.
dataSize - is the number of bytes of data involved in the activity, or null if not applicable.
entityIdentifier - is a (possibly long) string which identifies the object involved in the history record. The interpretation of this field will differ from connector to connector. May be null.
resultCode - contains a terse description of the result of the activity. The description is limited in size to 255 characters, and can be interpreted only in the context of the current connector. May be null.
resultDescription - is a (possibly long) human-readable string which adds detail, if required, to the result described in the resultCode field. This field is not meant to be queried on. May be null.
childIdentifiers - is a set of child entity identifiers associated with this activity. May be null.
Throws:
ManifoldCFException
public void flush()
throws ManifoldCFException
Throws:
ManifoldCFException
protected void processDocumentReferences() throws ManifoldCFException
Throws:
ManifoldCFException
public void checkJobStillActive()
throws ManifoldCFException,
ServiceInterruption
Specified by: checkJobStillActive in interface IAbortActivity
Throws:
ManifoldCFException
ServiceInterruption
public boolean beginEventSequence(String eventName) throws ManifoldCFException
Specified by: beginEventSequence in interface IEventActivity
Parameters:
eventName - is the event name.
Throws:
ManifoldCFException
public void completeEventSequence(String eventName) throws ManifoldCFException
Specified by: completeEventSequence in interface IEventActivity
Parameters:
eventName - is the event name.
Throws:
ManifoldCFException
public void retryDocumentProcessing(String localIdentifier) throws ManifoldCFException
Specified by: retryDocumentProcessing in interface IEventActivity
Parameters:
localIdentifier - is the document identifier to requeue.
Throws:
ManifoldCFException
public boolean checkDateIndexable(Date date) throws ManifoldCFException, ServiceInterruption
Specified by: checkDateIndexable in interface IFingerprintActivity
Parameters:
date - is the date of the document; may be null.
Throws:
ManifoldCFException
ServiceInterruption
public boolean checkMimeTypeIndexable(String mimeType) throws ManifoldCFException, ServiceInterruption
Specified by: checkMimeTypeIndexable in interface IFingerprintActivity
Parameters:
mimeType - is the mime type to check, not including any character set specification.
Throws:
ManifoldCFException
ServiceInterruption
public boolean checkDocumentIndexable(File localFile) throws ManifoldCFException, ServiceInterruption
Specified by: checkDocumentIndexable in interface IFingerprintActivity
Parameters:
localFile - is the local copy of the file to check.
Throws:
ManifoldCFException
ServiceInterruption
public boolean checkLengthIndexable(long length)
throws ManifoldCFException,
ServiceInterruption
Specified by: checkLengthIndexable in interface IFingerprintActivity
Parameters:
length - is the length to check.
Throws:
ManifoldCFException
ServiceInterruption
public boolean checkURLIndexable(String url) throws ManifoldCFException, ServiceInterruption
Specified by: checkURLIndexable in interface IFingerprintActivity
Parameters:
url - is the URL of the document.
Throws:
ManifoldCFException
ServiceInterruption
public String createGlobalString(String simpleString)
Specified by: createGlobalString in interface INamingActivity
Parameters:
simpleString - is the simple string.
public String createConnectionSpecificString(String simpleString)
Specified by: createConnectionSpecificString in interface INamingActivity
Parameters:
simpleString - is the simple string.
public String createJobSpecificString(String simpleString)
Specified by: createJobSpecificString in interface INamingActivity
Parameters:
simpleString - is the simple string.
protected void checkAllComponentsMultipleDispositions(String documentIdentifier)
protected void checkMultipleDispositions(String documentIdentifier, String componentIdentifier, String componentIdentifierHash)
protected void touchAllComponentsSet(String documentIdentifier)
protected void touchComponentSet(String documentIdentifier, String componentIdentifierHash)
protected IPipelineSpecificationWithVersions computePipelineSpecificationWithVersions(String documentIdentifierHash, String componentIdentifierHash, String documentIdentifier)