Configuration Cheat Sheet

Configuration Cheat Sheet#

Below is a list of all configurations for the core modules in Auto Archiver

Configuration File#

# Module configuration


# Antibot Extractor/Enricher configuration options (all options are optional)
antibot_extractor_enricher:
  save_to_pdf: false  # save a PDF snapshot of the page.
  max_download_images: 50 # maximum number of images to download from the page (0 = no download, inf = no limit).
  max_download_videos: 50 # maximum number of videos to download from the page (0 = no download, inf = no limit).
  user_data_dir: secrets/antibot_user_data # Path to the user data directory for the webdriver, used to persist browser state such as cookies and local storage. In the docker deployment `_docker` is appended to this path, because the folder cannot be shared between the host and the container due to user permissions.
  detect_auth_wall: true # detect whether the page is behind an authentication wall (e.g. login required) and skip it. Disable this if you want to archive pages where logins are required.
  proxy: # proxy to use for the webdriver, format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'. Empty (no value) by default.


# Command Line Feeder configuration options
cli_feeder:
  urls:  # Optional. URL(s) to archive, either a single URL or a list of URLs; should not be set in config.yaml


# CSV Database configuration options
csv_db:
  csv_file: db.csv  # Optional. Name of the CSV file to save metadata to


# Generic Extractor configuration options (all options are optional)
generic_extractor:
  subtitles: true  # download subtitles if available
  comments: false # download all comments if available; may lead to large metadata
  livestreams: false # if set, will download live streams, otherwise will skip them; see --max-filesize for more control
  live_from_start: false # if set, will download live streams from their earliest available moment, otherwise starts from the current moment.
  proxy: '' # http/https/socks proxy to use for the webdriver, e.g. https://proxy-user:password@proxy-ip:port
  proxy_on_failure_only: true # Applies only if a proxy is set. If true, the extractor will only use the proxy when the initial request fails; if false, the extractor will always use the proxy.
  end_means_success: true # if true, any archived content will mean a 'success'; if false, this extractor will not return a 'success' stage. This is useful for cases when yt-dlp archives a video but ignores other types of content, like images or text-only pages, that the subsequent extractors can retrieve.
  allow_playlist: false # if true, will also download playlists; set to false if the expectation is to download a single video.
  max_downloads: inf # limits the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.
  bguils_po_token_method: auto # Set up a Proof of Origin token provider. This process has additional requirements. See [authentication](https://auto-archiver.readthedocs.io/en/latest/how_to/authentication_how_to.html) for more information.
  extractor_args: {} # Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.
  ytdlp_update_interval: 5 # How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.
  ytdlp_args: '' # Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See the yt-dlp documentation for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options . Note: this is not to be confused with 'extractor_args', which are specific to the extractor itself.


# Hash Enricher configuration options (all options are optional)
hash_enricher:
  algorithm: SHA-256  # hash algorithm to use
  chunksize: 16000000 # number of bytes to read at a time when reading files in chunks (if this value is too large you will run out of RAM); default is 16 MB


# HTML Formatter configuration options
html_formatter:
  detect_thumbnails: true  # Optional. If true, will group by thumbnails generated by the thumbnail enricher, by id 'thumbnail_00'


# Local Storage configuration options (all options are optional)
local_storage:
  path_generator: flat  # how to store the file in terms of directory structure: 'flat' stores files in the root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
  save_to: ./local_archive # folder in which to save archived content
  save_absolute: false # whether the path to the stored file is absolute or relative in the output result, including formatters (Warning: saving an absolute path will show your computer's file structure)


# SSL Certificate Enricher configuration options
ssl_enricher:
  skip_when_nothing_archived: true  # Optional. If true, will skip enriching when no media is archived


# Thumbnail Enricher configuration options (all options are optional)
thumbnail_enricher:
  thumbnails_per_minute: 60  # how many thumbnails to generate per minute of video; can be limited by max_thumbnails
  max_thumbnails: 16 # limit on the number of thumbnails to generate per video; 0 means no limit


# Auto Archiver API Database configuration options
api_db:
  api_endpoint: ''  # Required. API endpoint where calls are made to
  api_token: # Optional. API Bearer token.
  public: false # Optional. Whether the URL should be publicly available via the API
  author_id: # Optional. Which email to assign as author
  group_id: # Optional. Which group of users have access to the archive in case public=false as author
  use_api_cache: false # Optional. If true, the API database will be queried prior to any archiving operations, and archiving will stop if the link has already been archived
  store_results: true # Optional. When set, will send the results to the API database.
  tags: [] # Optional. What tags to add to the archived URL


# Atlos Feeder Database Storage configuration options
atlos_feeder_db_storage:
  api_token: ''  # Required. An Atlos API token. For more information, see https://docs.atlos.org/technical/api/
  atlos_url: https://platform.atlos.org # Optional. The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.


# CSV Feeder configuration options
csv_feeder:
  files:  # Required. Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line
  column: # Optional. Column number or name to read the URLs from, 0-indexed


# Google Drive Storage configuration options
gdrive_storage:
  path_generator: url  # Optional. How to store the file in terms of directory structure: 'flat' stores files in the root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # Optional. How to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
  root_folder_id: '' # Required. Root Google Drive folder ID to use as storage, found in the URL: 'https://drive.google.com/drive/folders/FOLDER_ID'
  oauth_token: # Optional. JSON filename with Google Drive OAuth token: check the auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards the owner of the GDrive folder, therefore it is best to use oauth_token over service_account.
  service_account: secrets/service_account.json # Optional. Service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.


# Google Sheets Feeder Database configuration options
gsheet_feeder_db:
  sheet:  # Optional. Name of the sheet to archive
  sheet_id: # Optional. The id of the sheet to archive (alternative to 'sheet' config)
  header: 1 # Optional. Index of the header row (starts at 1)
  service_account: secrets/service_account.json # Required. Service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html
  columns: # Optional. Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting
    url: link
    status: archive status
    folder: destination folder
    archive: archive location
    date: archive date
    thumbnail: thumbnail
    timestamp: upload timestamp
    title: upload title
    text: text content
    screenshot: screenshot
    hash: hash
    pdq_hash: perceptual hashes
    wacz: wacz
    replaywebpage: replaywebpage
  allow_worksheets: !!set {} # Optional. Worksheet names that should be processed (overrides block_worksheets); leave empty so all are allowed
  block_worksheets: !!set {} # Optional. Worksheet names for worksheets that should be explicitly blocked from being processed
  use_sheet_names_in_stored_paths: true # Optional. If true, the stored files path will include 'workbook_name/worksheet_name/...'


# Instagram API Extractor configuration options
instagram_api_extractor:
  access_token:  # Optional. A valid instagrapi-api token
  api_endpoint: '' # Required. API endpoint to use
  full_profile: false # Optional. If true, will download all posts, tagged posts, stories, and highlights for a profile; if false, will only download the profile pic and information.
  full_profile_max_posts: 0 # Optional. Limits the number of posts to download when full_profile is true or when a URL for multiple posts is passed (like /stories /highlights ...). 0 means no limit. When full_profile is true the order of downloaded content is stories -> posts -> tagged posts -> highlights, so a value of 10 could download 2 stories, 7 posts, 1 tagged post, and 0 highlights.
  minimize_json_output: true # Optional. If true, will remove empty values from the json output


# Instagram Extractor configuration options
instagram_extractor:
  username: ''  # Required. A valid Instagram username.
  password: '' # Required. The corresponding Instagram account password.
  download_folder: instaloader # Optional. Name of a folder to temporarily download content to.
  session_file: secrets/instaloader.session # Optional. Path to the Instagram session file which saves session credentials. If one doesn't exist, this is the path where a new one is stored.


# Instagram Telegram Bot Extractor configuration options (all options are optional)
instagram_tbot_extractor:
  api_id:  # Telegram API_ID value, go to https://my.telegram.org/apps
  api_hash: # Telegram API_HASH value, go to https://my.telegram.org/apps
  session_file: secrets/anon-insta # records the Telegram login session for future usage; '.session' will be appended to the provided value.
  timeout: 45 # timeout to fetch the Instagram content, in seconds.


# Media Metadata Enricher configuration options
metadata_enricher:
  look_for_keys: []  # Optional. List of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.


# OpenTimestamps Enricher configuration options (all options are optional)
opentimestamps_enricher:
  calendar_urls:  # List of OpenTimestamps calendar servers to use for timestamping. For a list of calendars maintained by OpenTimestamps, see https://opentimestamps.org/#calendars
  - https://alice.btc.calendar.opentimestamps.org
  - https://bob.btc.calendar.opentimestamps.org
  - https://finney.calendar.eternitywall.com
  calendar_whitelist: [] # Whitelist of calendar servers. Override this if you are using your own calendar servers, e.g. ['https://mycalendar.com']


# S3 Storage configuration options (all options are optional)
s3_storage:
  path_generator: flat  # how to store the file in terms of directory structure: 'flat' stores files in the root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.
  filename_generator: static # how to name stored files: 'random' creates a random string; 'static' uses a hash, with the settings of the 'hash_enricher' module (defaults to SHA256 if not enabled).
  bucket: # S3 bucket name
  region: # S3 region name
  key: # S3 API key
  secret: # S3 API secret
  random_no_duplicate: false # if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and, if so, will not upload it again. Creates a new root folder path `no-dups/`
  endpoint_url: https://{region}.digitaloceanspaces.com # S3 bucket endpoint; {region} is inserted at runtime
  cdn_url: https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key} # S3 CDN URL; {bucket}, {region} and {key} are inserted at runtime
  private: false # if true, S3 files will not be readable online


# Telethon Extractor configuration options (all options are optional)
telethon_extractor:
  api_id:  # Telegram API_ID value, go to https://my.telegram.org/apps
  api_hash: # Telegram API_HASH value, go to https://my.telegram.org/apps
  bot_token: # allows access to more content such as large videos; talk to @botfather
  session_file: secrets/anon # Path of the file to save the Telegram login session for future usage; '.session' will be appended to the provided path.
  join_channels: true # the original description says this "disables the initial setup with channel_invites config" — presumably that applies when set to false (TODO confirm); useful if you have a lot and get stuck
  channel_invites: {} # (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1); the Telegram account will join any new channels on setup


# Timestamping Enricher configuration options (all options are optional)
timestamping_enricher:
  tsa_urls:  # List of RFC3161 Time Stamp Authorities to use; separate with commas if passed via the command line.
  - http://timestamp.identrust.com
  - http://timestamp.ssl.trustwave.com
  - http://zeitstempel.dfn.de
  - http://ts.ssl.com
  - http://tsa.lex-persona.com/tsa
  - http://tss.cnbs.gob.hn/TSS/HttpTspServer
  cert_authorities: # Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.
  allow_selfsigned: false # Whether or not to allow and save self-signed timestamping certificates. This allows a greater range of timestamping servers to be used, but they are not trusted authorities.


# Twitter API Extractor configuration options (all options are optional)
twitter_api_extractor:
  bearer_token:  # [deprecated: see bearer_tokens] Twitter API bearer_token, which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret
  bearer_tokens: [] # a list of Twitter API bearer_token values, which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret; if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line
  consumer_key: # Twitter API consumer_key
  consumer_secret: # Twitter API consumer_secret
  access_token: # Twitter API access_token
  access_secret: # Twitter API access_secret


# WACZ Enricher (and Extractor) configuration options
wacz_extractor_enricher:
  profile:  # browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).
  docker_commands: # if a custom docker invocation is needed
  timeout: 120 # timeout for WACZ generation, in seconds
  extract_media: false # If enabled, all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
  extract_screenshot: true # If enabled, the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.
  socks_proxy_host: # SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port, e.g. user:password@host
  socks_proxy_port: # SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host, e.g. 1234
  proxy_server: # SOCKS server proxy URL (in development)


# Wayback Machine Enricher (and Extractor) configuration options
wayback_extractor_enricher:
  timeout: 15  # seconds to wait for successful archive confirmation from wayback; if more than this passes, the result contains the job_id so the status can later be checked manually.
  if_not_archived_within: # only tell wayback to archive if no archive is available before the number of seconds specified; use None (left empty here) to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
  key: '' # wayback API key. To get credentials visit https://archive.org/account/s3.php
  secret: '' # wayback API secret. To get credentials visit https://archive.org/account/s3.php
  proxy_http: # http proxy to use for wayback requests, e.g. http://proxy-user:password@proxy-ip:port
  proxy_https: # https proxy to use for wayback requests, e.g. https://proxy-user:password@proxy-ip:port


# Whisper Enricher configuration options
whisper_enricher:
  api_endpoint: ''  # WhisperApi API endpoint, e.g. https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe.
  api_key: '' # WhisperApi API key for authentication
  include_srt: false # Whether to include a subtitle SRT (SubRip Subtitle) file for the video (can be used in video players).
  timeout: 90 # How many seconds to wait at most for a successful job completion.
  action: translate # which Whisper operation to execute

Command Line#

Configuration Options#

Option

Description

Default

Type

antibot_extractor_enricher.save_to_pdf

Optional. save a PDF snapshot of the page.

False

bool

antibot_extractor_enricher.max_download_images

Optional. maximum number of images to download from the page (0 = no download, inf = no limit).

50

string

antibot_extractor_enricher.max_download_videos

Optional. maximum number of videos to download from the page (0 = no download, inf = no limit).

50

string

antibot_extractor_enricher.user_data_dir

Optional. Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. If you use the docker deployment, this path will be appended with _docker that is because the folder cannot be shared between the host and the container due to user permissions.

secrets/antibot_user_data

string

antibot_extractor_enricher.detect_auth_wall

Optional. detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.

True

bool

antibot_extractor_enricher.proxy

Optional. proxy to use for the webdriver, Format: ‘SERVER:PORT’ or ‘USER:PASS@SERVER:PORT’

None

string

cli_feeder.urls

Optional. URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml

None

string

csv_db.csv_file

Optional. CSV file name to save metadata to

db.csv

string

generic_extractor.subtitles

Optional. download subtitles if available

True

bool

generic_extractor.comments

Optional. download all comments if available, may lead to large metadata

False

bool

generic_extractor.livestreams

Optional. if set, will download live streams, otherwise will skip them; see --max-filesize for more control

False

bool

generic_extractor.live_from_start

Optional. if set, will download live streams from their earliest available moment, otherwise starts now.

False

bool

generic_extractor.proxy

Optional. http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port

string

generic_extractor.proxy_on_failure_only

Optional. Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.

True

string

generic_extractor.end_means_success

Optional. if True, any archived content will mean a ‘success’, if False this extractor will not return a ‘success’ stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.

True

bool

generic_extractor.allow_playlist

Optional. If True will also download playlists, set to False if the expectation is to download a single video.

False

bool

generic_extractor.max_downloads

Optional. Use to limit the number of videos to download when a channel or long page is being extracted. ‘inf’ means no limit.

inf

string

generic_extractor.bguils_po_token_method

Optional. Set up a Proof of origin token provider. This process has additional requirements. See the authentication how-to (https://auto-archiver.readthedocs.io/en/latest/how_to/authentication_how_to.html) for more information.

auto

string

generic_extractor.extractor_args

Optional. Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.

{}

json_loader

generic_extractor.ytdlp_update_interval

Optional. How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.

5

int

generic_extractor.ytdlp_args

Optional. Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. See the yt-dlp documentation for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options . Note: this is not to be confused with ‘extractor_args’, which are specific to the extractor itself.

string

hash_enricher.algorithm

Optional. hash algorithm to use

SHA-256

string

hash_enricher.chunksize

Optional. number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB

16000000

int

html_formatter.detect_thumbnails

Optional. if true will group by thumbnails generated by thumbnail enricher by id ‘thumbnail_00’

True

bool

local_storage.path_generator

Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.

flat

string

local_storage.filename_generator

Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled)

static

string

local_storage.save_to

Optional. folder where to save archived content

./local_archive

string

local_storage.save_absolute

Optional. whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer’s file structure)

False

bool

ssl_enricher.skip_when_nothing_archived

Optional. if true, will skip enriching when no media is archived

True

bool

thumbnail_enricher.thumbnails_per_minute

Optional. how many thumbnails to generate per minute of video, can be limited by max_thumbnails

60

int

thumbnail_enricher.max_thumbnails

Optional. limit the number of thumbnails to generate per video, 0 means no limit

16

int

api_db.api_endpoint

Required. API endpoint where calls are made to

string

api_db.api_token

Optional. API Bearer token.

None

string

api_db.public

Optional. whether the URL should be publicly available via the API

False

bool

api_db.author_id

Optional. which email to assign as author

None

string

api_db.group_id

Optional. which group of users have access to the archive in case public=false as author

None

string

api_db.use_api_cache

Optional. if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived

False

bool

api_db.store_results

Optional. when set, will send the results to the API database.

True

bool

api_db.tags

Optional. what tags to add to the archived URL

[]

string

atlos_feeder_db_storage.api_token

Required. An Atlos API token. For more information, see https://docs.atlos.org/technical/api/

string

atlos_feeder_db_storage.atlos_url

Optional. The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.

https://platform.atlos.org

string

csv_feeder.files

Required. Path to the input file(s) to read the URLs from, comma separated. Input files should be formatted with one URL per line

None

valid_file

csv_feeder.column

Optional. Column number or name to read the URLs from, 0-indexed

None

string

gdrive_storage.path_generator

Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.

url

string

gdrive_storage.filename_generator

Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).

static

string

gdrive_storage.root_folder_id

Required. root google drive folder ID to use as storage, found in URL: ‘https://drive.google.com/drive/folders/FOLDER_ID’

string

gdrive_storage.oauth_token

Optional. JSON filename with Google Drive OAuth token: check the auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards the owner of the GDrive folder, therefore it is best to use oauth_token over service_account.

None

string

gdrive_storage.service_account

Optional. service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account.

secrets/service_account.json

string

gsheet_feeder_db.sheet

Optional. name of the sheet to archive

None

string

gsheet_feeder_db.sheet_id

Optional. the id of the sheet to archive (alternative to ‘sheet’ config)

None

string

gsheet_feeder_db.header

Optional. index of the header row (starts at 1)

1

int

gsheet_feeder_db.service_account

Required. service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html

secrets/service_account.json

string

gsheet_feeder_db.columns

Optional. Custom names for the columns in your Google sheet. If you don’t want to use the default column names, change them with this setting

{‘url’: ‘link’, ‘status’: ‘archive status’, ‘folder’: ‘destination folder’, ‘archive’: ‘archive location’, ‘date’: ‘archive date’, ‘thumbnail’: ‘thumbnail’, ‘timestamp’: ‘upload timestamp’, ‘title’: ‘upload title’, ‘text’: ‘text content’, ‘screenshot’: ‘screenshot’, ‘hash’: ‘hash’, ‘pdq_hash’: ‘perceptual hashes’, ‘wacz’: ‘wacz’, ‘replaywebpage’: ‘replaywebpage’}

json_loader

gsheet_feeder_db.allow_worksheets

Optional. A list of worksheet names that should be processed (overrides block_worksheets), leave empty so all are allowed

set()

string

gsheet_feeder_db.block_worksheets

Optional. A list of worksheet names for worksheets that should be explicitly blocked from being processed

set()

string

gsheet_feeder_db.use_sheet_names_in_stored_paths

Optional. if True the stored files path will include ‘workbook_name/worksheet_name/…’

True

bool

instagram_api_extractor.access_token

Optional. a valid instagrapi-api token

None

string

instagram_api_extractor.api_endpoint

Required. API endpoint to use

string

instagram_api_extractor.full_profile

Optional. if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.

False

bool

instagram_api_extractor.full_profile_max_posts

Optional. Use to limit the number of posts to download when full_profile is true or when a URL for multiple posts is passed (like /stories /highlights …). 0 means no limit. when full_profile is true the order of downloaded content is stories -> posts -> tagged posts -> highlights, so a value of 10 could download 2 stories, 7 posts, 1 tagged posts, and 0 highlights.

0

int

instagram_api_extractor.minimize_json_output

Optional. if true, will remove empty values from the json output

True

bool

instagram_extractor.username

Required. A valid Instagram username.

string

instagram_extractor.password

Required. The corresponding Instagram account password.

string

instagram_extractor.download_folder

Optional. Name of a folder to temporarily download content to.

instaloader

string

instagram_extractor.session_file

Optional. Path to the instagram session file which saves session credentials. If one doesn’t exist this gives the path to store a new one.

secrets/instaloader.session

string

instagram_tbot_extractor.api_id

Optional. telegram API_ID value, go to https://my.telegram.org/apps

None

string

instagram_tbot_extractor.api_hash

Optional. telegram API_HASH value, go to https://my.telegram.org/apps

None

string

instagram_tbot_extractor.session_file

Optional. Records the telegram login session for future usage, ‘.session’ will be appended to the provided value.

secrets/anon-insta

string

instagram_tbot_extractor.timeout

Optional. timeout to fetch the instagram content in seconds.

45

int

metadata_enricher.look_for_keys

Optional. list of lowercased metadata keys that will be included in the enriched metadata. Special keys: ‘author’, ‘datetimes’, ‘location’ to include related metadata fields. The default empty list [] means all metadata will be included.

[]

list

opentimestamps_enricher.calendar_urls

Optional. List of OpenTimestamps calendar servers to use for timestamping. For a list of calendars maintained by OpenTimestamps, see https://opentimestamps.org/#calendars

[‘https://alice.btc.calendar.opentimestamps.org’, ‘https://bob.btc.calendar.opentimestamps.org’, ‘https://finney.calendar.eternitywall.com’]

list

opentimestamps_enricher.calendar_whitelist

Optional. Optional whitelist of calendar servers. Override this if you are using your own calendar servers. e.g. [‘https://mycalendar.com’]

[]

list

s3_storage.path_generator

Optional. how to store the file in terms of directory structure: ‘flat’ sets to root; ‘url’ creates a directory based on the provided URL; ‘random’ creates a random directory.

flat

string

s3_storage.filename_generator

Optional. how to name stored files: ‘random’ creates a random string; ‘static’ uses a hash, with the settings of the ‘hash_enricher’ module (defaults to SHA256 if not enabled).

static

string

s3_storage.bucket

Optional. S3 bucket name

None

string

s3_storage.region

Optional. S3 region name

None

string

s3_storage.key

Optional. S3 API key

None

string

s3_storage.secret

Optional. S3 API secret

None

string

s3_storage.random_no_duplicate

Optional. if set, it will override path_generator, filename_generator and folder. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path no-dups/

False

bool

s3_storage.endpoint_url

Optional. S3 bucket endpoint, {region} are inserted at runtime

https://{region}.digitaloceanspaces.com

string

s3_storage.cdn_url

Optional. S3 CDN url, {bucket}, {region} and {key} are inserted at runtime

https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}

string

s3_storage.private

Optional. if true S3 files will not be readable online

False

bool

telethon_extractor.api_id

Optional. telegram API_ID value, go to https://my.telegram.org/apps

None

string

telethon_extractor.api_hash

Optional. telegram API_HASH value, go to https://my.telegram.org/apps

None

string

telethon_extractor.bot_token

Optional. Allows access to more content such as large videos, talk to @botfather

None

string

telethon_extractor.session_file

Optional. Path of the file to save the telegram login session for future usage, ‘.session’ will be appended to the provided path.

secrets/anon

string

telethon_extractor.join_channels

Optional. Set to false to disable the initial setup with channel_invites config, useful if you have a lot and get stuck

True

bool

telethon_extractor.channel_invites

Optional. (JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup

{}

json_loader

timestamping_enricher.tsa_urls

Optional. List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.

['http://timestamp.identrust.com', 'http://timestamp.ssl.trustwave.com', 'http://zeitstempel.dfn.de', 'http://ts.ssl.com', 'http://tsa.lex-persona.com/tsa', 'http://tss.cnbs.gob.hn/TSS/HttpTspServer']

string

timestamping_enricher.cert_authorities

Optional. Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.

None

string

timestamping_enricher.allow_selfsigned

Optional. Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, but they are not trusted authorities

False

bool

twitter_api_extractor.bearer_token

Optional. [deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret

None

string

twitter_api_extractor.bearer_tokens

Optional. a list of twitter API bearer tokens, any one of which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret; if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line

[]

string

twitter_api_extractor.consumer_key

Optional. twitter API consumer_key

None

string

twitter_api_extractor.consumer_secret

Optional. twitter API consumer_secret

None

string

twitter_api_extractor.access_token

Optional. twitter API access_token

None

string

twitter_api_extractor.access_secret

Optional. twitter API access_secret

None

string

wacz_extractor_enricher.profile

Optional. browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).

None

string

wacz_extractor_enricher.docker_commands

Optional. if a custom docker invocation is needed

None

string

wacz_extractor_enricher.timeout

Optional. timeout for WACZ generation in seconds

120

int

wacz_extractor_enricher.extract_media

Optional. If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.

False

bool

wacz_extractor_enricher.extract_screenshot

Optional. If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched.

True

bool

wacz_extractor_enricher.socks_proxy_host

Optional. SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host

None

string

wacz_extractor_enricher.socks_proxy_port

Optional. SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234

None

int

wacz_extractor_enricher.proxy_server

Optional. SOCKS server proxy URL, in development

None

string

wayback_extractor_enricher.timeout

Optional. seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.

15

int

wayback_extractor_enricher.if_not_archived_within

Optional. only tell wayback to archive if no archive has been made within the specified number of seconds, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA

None

string

wayback_extractor_enricher.key

Required. wayback API key. to get credentials visit https://archive.org/account/s3.php

string

wayback_extractor_enricher.secret

Required. wayback API secret. to get credentials visit https://archive.org/account/s3.php

string

wayback_extractor_enricher.proxy_http

Optional. http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port

None

string

wayback_extractor_enricher.proxy_https

Optional. https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port

None

string

whisper_enricher.api_endpoint

Required. WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of bellingcat/whisperbox-transcribe.

string

whisper_enricher.api_key

Required. WhisperApi api key for authentication

string

whisper_enricher.include_srt

Optional. Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players).

False

bool

whisper_enricher.timeout

Optional. How many seconds to wait at most for a successful job completion.

90

int

whisper_enricher.action

Optional. which Whisper operation to execute

translate

string