# Defines settings for trafilatura (https://trafilatura.readthedocs.io/en/latest/settings.html)

[DEFAULT]

# Download
# DOWNLOAD_TIMEOUT: presumably in seconds — confirm against the trafilatura settings docs
# MAX_FILE_SIZE / MIN_FILE_SIZE: upper and lower bounds on accepted input size
# (unit presumably bytes — TODO confirm)
DOWNLOAD_TIMEOUT = 30
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

# sleep between requests (presumably in seconds — confirm)
SLEEP_TIME = 5.0

# one line per user-agent; empty by default
# to set custom agents, uncomment the example lines below and keep them indented:
# assuming this file is read by Python's configparser, indented lines are
# treated as continuation lines of the USER_AGENTS value
# NOTE(review): configparser keeps quote characters as part of the value, so
# real entries should probably be written without the quotes — confirm
USER_AGENTS =
#     "agent1"
#     "agent2"

# cookie for HTTP requests; empty by default
COOKIE =

# maximum number of redirects that we will follow
MAX_REDIRECTS = 2


# Extraction
# minimum size thresholds; the *_COMM_* keys presumably apply to extracted
# comments, the others to the main text (sizes presumably counted in
# characters — confirm against the trafilatura settings docs)
MIN_EXTRACTED_SIZE = 250
MIN_EXTRACTED_COMM_SIZE = 1
MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1


# discard documents with too many elements
# NOTE(review): deliberately left empty — presumably means "no limit"; confirm
# that the reader accepts an empty value here (converting an empty string to an
# integer would fail)
MAX_TREE_SIZE =


# CLI file processing only, set to 0 to disable
# (value presumably in seconds — confirm against the trafilatura settings docs)
EXTRACTION_TIMEOUT = 30


# Deduplication
# MIN_DUPLCHECK_SIZE: presumably the minimum segment size (in characters) for
# which duplicate checks are run — confirm
# MAX_REPETITIONS: presumably how many times the same segment may occur before
# it is treated as a duplicate — confirm
MIN_DUPLCHECK_SIZE = 100
MAX_REPETITIONS = 2


# Extraction option for Htmldate
# boolean switch written as on/off, like the other switch in this file;
# presumably toggles htmldate's more thorough date search — confirm
EXTENSIVE_DATE_SEARCH = on


# URLs in feeds and sitemaps
# boolean switch written as on/off, like the other switch in this file;
# presumably controls whether links pointing to other websites are kept — confirm
EXTERNAL_URLS = off
