|
|
| |
pavuk(1) |
Internet utils |
pavuk(1) |
pavuk - HTTP, HTTP over SSL, FTP, FTP over SSL and Gopher recursive document
retrieval program
pavuk [-mode {normal | resumeregets | singlepage | singlereget | sync |
dontstore | ftpdir | mirror}] [-X] [-runX] [-bg/-nobg] [-prefs/-noprefs]
[-h] [-v] [-progress/-noprogress] [-stime/-nostime] [-xmaxlog $nr]
[-logfile $file] [-slogfile $file] [-auth_file $file] [-msgcat $dir]
[-language $str] [-gui_font $font] [-quiet/-verbose]
[-read_css/-noread_css] [-cdir $dir] [-scndir $dir]
[-scenario $str] [-dumpscn $filename] [-lmax $nr] [-dmax $nr]
[-leave_level $nr] [-maxsize $nr] [-minsize $nr] [-asite
$list] [-dsite $list] [-adomain $list] [-ddomain $list] [-asfx
$list] [-dsfx $list] [-aprefix $list] [-dprefix $list] [-amimet
$list] [-dmimet $list] [-pattern $pattern] [-url_pattern $pattern]
[-rpattern $regexp] [-url_rpattern $regexp] [-skip_pattern $pattern]
[-skip_url_pattern $pattern] [-skip_rpattern $regexp]
[-skip_url_rpattern $regexp] [-newer_than $time] [-older_than
$time] [-schedule $time] [-reschedule $nr]
[-dont_leave_site/-leave_site] [-dont_leave_dir/-leave_dir]
[-http_proxy $site[:$port]] [-ftp_proxy $site[:$port]] [-ssl_proxy
$site[:$port]] [-gopher_proxy $site[:$port]]
[-ftp_httpgw/-noftp_httpgw] [-ftp_dirtyproxy/-noftp_dirtyproxy]
[-gopher_httpgw/-nogopher_httpgw] [-noFTP/-FTP] [-noHTTP/-HTTP]
[-noSSL/-SSL] [-noGopher/-Gopher] [-FTPdir/-noFTPdir]
[-noCGI/-CGI] [-FTPlist/-noFTPlist] [-FTPhtml/-noFTPhtml]
[-noRelocate/-Relocate] [-force_reget/-noforce_reget]
[-nocache/-cache] [-check_size/-nocheck_size] [-noRobots/-Robots]
[-noEnc/-Enc] [-auth_name $user] [-auth_passwd $pass] [-auth_scheme
1/2/3/4/user/Basic/Digest/NTLM]
[-auth_reuse_nonce/-no_auth_reuse_nonce] [-http_proxy_user $user]
[-http_proxy_pass $pass] [-http_proxy_auth 1/2/3/4/user/Basic/Digest/NTLM]
[-auth_reuse_proxy_nonce/-no_auth_reuse_proxy_nonce] [-ssl_key_file
$file] [-ssl_cert_file $file] [-ssl_cert_passwd $pass] [-from $email]
[-send_from/-nosend_from] [-identity $str] [-auto_referer/-noauto_referer]
[-referer/-noreferer] [-alang $list] [-acharset $list]
[-retry $nr] [-nregets $nr] [-nredirs $nr] [-rollback $nr] [-sleep $nr]
[-timeout $nr] [-preserve_time/-nopreserve_time]
[-preserve_perm/-nopreserve_perm] [-preserve_slinks/-nopreserve_slinks]
[-bufsize $nr] [-maxrate $nr] [-minrate $nr] [-user_condition
$str] [-cookie_file $file] [-cookie_send/-nocookie_send]
[-cookie_recv/-nocookie_recv] [-cookie_update/-nocookie_update] [-cookies_max
$nr] [-disabled_cookie_domains $list] [-disable_html_tag
$TAG,[$ATTRIB][;...]] [-enable_html_tag $TAG,[$ATTRIB][;...]]
[-tr_del_chr $str] [-tr_str_str $str1 $str2] [-tr_chr_chr $chrset1
$chrset2] [-index_name $str] [-store_index/-nostore_index]
[-store_name $str] [-debug/-nodebug] [-debug_level $level] [-browser
$str] [-urls_file $file] [-file_quota $nr] [-trans_quota $nr]
[-fs_quota $nr] [-enable_js/-disable_js] [-fnrules $t $m $r]
[-store_info/-nostore_info] [-all_to_local/-noall_to_local]
[-sel_to_local/-nosel_to_local] [-all_to_remote/-noall_to_remote]
[-url_strategie $strategie] [-remove_adv/-noremove_adv] [-adv_re
$RE] [-check_bg/-nocheck_bg]
[-send_if_range/-nosend_if_range] [-sched_cmd $str]
[-unique_log/-nounique_log] [-post_cmd $str] [-ssl_version $v]
[-unique_sslid/-nounique_sslid] [-aip_pattern $re] [-dip_pattern
$re] [-use_http11/-nouse_http11] [-local_ip $addr]
[-request $req] [-formdata $req] [-httpad $str] [-nthreads
$nr] [-immesg/-noimmesg] [-dumpfd $nr] [-dump_urlfd $nr]
[-unique_name/-nounique_name]
[-leave_site_enter_dir/-dont_leave_site_enter_dir] [-max_time
$nr] [-del_after/-nodel_after] [-singlepage/-nosinglepage]
[-dump_after/-nodump_after] [-dump_response/-nodump_response]
[-auth_ntlm_domain $str] [-auth_proxy_ntlm_domain $str] [-js_pattern
$re] [-follow_cmd $str] [-retrieve_symlink/-noretrieve_symlink]
[-js_transform $p $t $h $a] [-js_transform2 $p $t $h $a]
[-ftp_proxy_user $str] [-ftp_proxy_pass $str]
[-limit_inlines/-dont_limit_inlines] [-ftp_list_options $str]
[-fix_wuftpd_list/-nofix_wuftpd_list] [-post_update/-nopost_update]
[-info_dir $dir] [-mozcache_dir $dir] [-aport $list] [-dport $list]
[-hack_add_index/-nohack_add_index] [-default_prefix $str]
[-rsleep/-norsleep] [-ftp_login_handshake $host $handshake]
[-js_script_file $file] [-dont_touch_url_pattern $pat]
[-dont_touch_url_rpattern $pat] [-dont_touch_tag_rpattern $pat]
[-tag_pattern $tag $attrib $url] [-tag_rpattern $tag $attrib
$url] [-nss_cert_dir $dir]
[-nss_accept_unknown_cert/-nonss_accept_unknown_cert]
[-nss_domestic_policy/-nss_export_policy] [-[no]verify]
[-tlogfile $file] [-trelative {object | program}]
[-transparent_proxy FQDN[:port]] [-transparent_ssl_proxy
FQDN[:port]] [-sdemo] [-noencode] [URLs]
pavuk -mode {normal | singlepage | singlereget}
[-base_level $nr]
pavuk -mode sync [-ddays $nr] [-subdir $dir]
[-remove_old/-noremove_old]
pavuk -mode resumeregets [-subdir $dir]
pavuk -mode linkupdate [-X] [-h] [-v] [-cdir $dir]
[-subdir $dir] [-scndir $dir] [-scenario $str]
pavuk -mode reminder [-remind_cmd $str]
pavuk -mode mirror [-subdir $dir]
[-remove_old/-noremove_old] [-remove_before_store/-noremove_before_store]
[-always_mdtm/-noalways_mdtm]
This manual page describes how to use pavuk. Pavuk can be used to mirror
contents of internet/intranet servers and to maintain copies in a local tree
of documents. Pavuk stores retrieved documents in locally mapped disk space.
The structure of the local tree is the same as the one on the remote server.
Each supported service (protocol) has its own subdirectory in the local tree.
Each referenced server has its own subdirectory inside the protocol
subdirectory; its name is the host name followed by the port number on which
the service resides, delimited by an underscore (host_port); this layout can
be changed. With the option -fnrules you
can change the default layout of the local document tree, without losing link
consistency.
With pavuk it is possible to have up-to-date copies of remote documents
in the local disk space.
As of version 0.3pl2, pavuk can automatically restart broken connections, and
reget partial content from an FTP server (which must support the REST
command), from a properly configured HTTP/1.1 server, or from a HTTP/1.0
server which supports Ranges.
As of version 0.6 it is possible to handle configurations via so called
scenarios. The best way to create such a configuration file is to use the X
Window interface and simply save the created configuration. The other way is
to use the -dumpscn switch.
As of version 0.7pl1 it is possible to store authentication information in
an authinfo file, which pavuk can then parse and use.
As of version 0.8pl4 pavuk can fetch documents for use in a local proxy/cache
server without storing them to the local document tree.
As of version 0.9pl4 pavuk supports SOCKS (4/5) proxies if you have the
required libraries.
As of version 0.9pl12 pavuk can preserve permissions of remote files and
symbolic links, so it can be used for powerful FTP mirroring.
Pavuk supports SSL connections to FTP servers, if you specify ftps:// URL
instead of ftp://.
Pavuk can automatically handle file names containing characters that are
unsafe for the filesystem. So far this is implemented only for the Win32
platform, and it is hard coded.
Pavuk can now use the HTTP/1.1 protocol for communication with HTTP servers.
It can use persistent connections, so one TCP connection can be used to
transfer several documents without closing it. This feature saves network
bandwidth and also speeds up network communication.
Pavuk can do configurable POST requests to HTTP servers and also supports
file uploading via HTTP POST requests.
Pavuk can automatically fill in HTML forms it finds, if the user supplies data
for their fields beforehand with the option -formdata.
Pavuk can run configurable number of concurrently running downloading threads
when compiled with multithreading support.
HTTP
http://[[user][:password]@]host[:port][/document]
[[user][:password]@]host[:port][/document]
HTTPS
https://[[user][:password]@]host[:port][/document]
ssl[.domain][:port][/document]
FTP
ftp://[[user][:password]@]host[:port][/relative_path][;type=x]
ftp://[[user][:password]@]host[:port][//absolute_path][;type=x]
ftp[.domain][:port][/document][;type=x]
FTPS
ftps://[[user][:password]@]host[:port][/relative_path][;type=x]
ftps://[[user][:password]@]host[:port][//absolute_path][;type=x]
ftps[.domain][:port][/document][;type=x]
Gopher
gopher://host[:port][/type[document]]
gopher[.domain][:port][/type[document]]
HTTP
http://[[user][:password]@]host[:port][/document][?query]
to
http/host_port/[document][?query]
HTTPS
https://[[user][:password]@]host[:port][/document][?query]
to
https/host_port/[document][?query]
FTP
ftp://[[user][:password]@]host[:port][/path]
to
ftp/host_port/[path]
FTPS
ftps://[[user][:password]@]host[:port][/path]
to
ftps/host_port/[path]
Gopher
gopher://host[:port][/type[document]]
to
gopher/host_port/[type[document]]
NOTE: Pavuk will use the string with which it queries the
target server as the name of the results file. This file name may, in some
cases, contain punctuations such as $,?,=,& etc. Such punctuation
can cause problems when you are trying to browse downloaded files with your
browser or you are trying to process downloaded files with shell scripts or
view files with file management utilities which reference the name of the
results file. If you believe that this may be causing problems for you, then
you can remove all punctuation from the result file name with the option:
-tr_del_chr [:punct:] or with other options for adjusting
filenames.
All options are case insensitive.
Mode
Help
Indicate/Logging/Interface options
Netli options
Special start
Scenario/Task options
Directory options
Preserve options
Proxy options
Proxy Authentification
Protocol/Download Option
Authentification
Site/Domain/Port Limitation Options
Limitation Document properties
Limitation Document name
Limitation Protocol Option
Other Limitation Options
Javascript support
Cookie
HTML rewriting engine tuning options
Filename/URL Conversion Option
Other Options
- -mode {normal, linkupdate, sync, singlepage, singlereget,
resumeregets, dontstore, reminder, ftpdir, mirror}
- Set operation mode.
normal - retrieves recursive documents
linkupdate - update remote URLs in local HTML documents to local URLs
if these URLs exist in the local tree
sync - synchronize remote documents with local tree (if a local copy
of a document is older than remote, the document is retrieved again,
otherwise nothing happens)
singlepage - URL is retrieved as one page with all inline objects
(picture, sound ...) this mode is now obsoleted by -singlepage
option.
resumeregets - pavuk scans the local tree for files that were not
retrieved fully and retrieves them again (uses partial get if possible)
singlereget - get URL until it is retrieved in full
dontstore - transfer page from server, but don't store it to the
local tree. This mode is suitable for fetching pages that are held in a
local proxy/cache server.
reminder - used to inform the user about changed documents
ftpdir - used to list the contents of FTP directories
default operation mode is normal mode.
- -h
- Print long verbose help message
- -v
- Show version information and the configuration at compilation time.
- -quiet
- Don't show any messages on the screen.
- -verbose
- Force to show output messages on the screen (default)
- -progress/-noprogress
- Show retrieving progress while running in the terminal (default is
progress off)
- -stime/-nostime
- Show start and end time of transfer. (by default this information is not
shown)
- -xmaxlog $nr
- Maximum number of log lines in the Log widget. 0 means unlimited. This
option is available only when compiled with the GTK+ GUI. (default value
is 0)
- -logfile $file
- File where all produced messages are stored.
- -unique_log/-nounique_log
- When logfile as specified with the option -logfile is already used
by another process, try to generate new unique name for the log file.
(default is this option turned off)
- -slogfile $file
- File to store short logs in. This file contains one line of informations
per processed document. This is meant to be used in connection with any
sort of script to produce some statistics, for validating links on your
website, or for generating simple sitemaps. Multiple pavuk processes can
use this file concurrently, without overwriting each others entries.
Record structure:
- PID of pavuk process
- TIME current time
- COUNTER in the format current/total number of URLs
- STATUS contains the type of the error: FATAL, ERR,
WARN or OK
- ERRCODE is the number code of the error
(see errcode.h in pavuk sources)
- URL of the document
- PARENTURL first parent document of this URL
(when it doesn't have parent - [none])
- FILENAME is the name of the local file the
document is saved under
- SIZE size of requested document if known
- DOWNLOAD_TIME time which takes downloading of this
document in format seconds.mili_seconds
- HTTPRESP contains the first line of the HTTP server
response
- -language $str
- Native language that pavuk should use for communication with its user
(works only when there is a message catalog for that language) GNU
gettext support (for message internationalization) must also be
compiled in. Default language is taken from your NLS environment
variables.
- -gui_font $font
- Font used in the GUI interface. To list available X fonts use the
xlsfonts command. This option is available only when compiled with
GTK+ GUI support.
- -[no]read_css
- Enable or disable fetching objects mentioned in style sheets.
- -[no]verify
- Enable or disable verifying server CERTS in SSL mode.
- -tlogfile $file
- Turn on Netli logging with output to specified file.
- -trelative {object | program}
- Make Netli timings relative to the start of the first object or the
program.
- -transparent_proxy FQDN[:port]
- When processing URL, send the original, but send it to the IP address at
FQDN
- -transparent_ssl_proxy FQDN[:port]
- When processing HTTPS URL, send the original, but send it to the IP
address at FQDN
- -sdemo
- Output in sdemo compatible format. This is only used by sdemo. (For now it
simply means output '-1' rather than '*' when measurements are invalid.)
- -noencode
- Do not escape characters that are "unsafe" in URLS.
- -X
- Start program with X Window interface (if compiled with support for GTK+).
Pavuk as default starts without GUI, and behaves as regular commandline
tool.
- -runX
- When used together with the -X option, pavuk starts processing of
URLs immediately after the GUI window is launched. Without the -X
given, this option doesn't have any effect. Only available when compiled
with GTK+ support.
- -bg/-nobg
- This option allows pavuk to detach from its terminal and run in background
mode. Pavuk will not output any messages to the terminal then. If you want
to see messages, you have to use the -logfile option to specify a
file where messages will be written. By default pavuk executes in the
foreground.
- -check_bg/-nocheck_bg
- Normally, programs sent into the background after being run in foreground
continue to output messages to the terminal. If this option is activated,
pavuk checks if it is running as background job and will not write any
messages to the terminal in this case. After it becomes a foreground job
again, it will start writing messages to terminal in the normal way. This
option is available only when your system supports retrieving of terminal
info via tc*() functions.
- -prefs/-noprefs
- When you turn this option on, pavuk will preserve all settings when
exiting, and when you run pavuk with GUI interface again, all settings
will be restored. The settings will be stored in the ~/.pavuk_prefs
file. By default pavuk won't restore its options when started. This option is
available only when compiled with GTK+.
- -schedule $time
- Execute pavuk at the time specified as parameter. The Format of the $time
parameter is YYYY.MM.DD.hh.mm. You need a properly configured scheduling
with the at command on your system for using this option. If
default configuration (at -f %f %t %d.%m.%Y) of scheduling command won't
work on your system, try to adjust it with -sched_cmd option.
- -reschedule $nr
- Execute pavuk periodically with $nr hours period. You need properly
configured scheduling with the at command on your system for using
this option.
- -sched_cmd $str
- Command to use for scheduling. Pavuk explicitly supports scheduling with
at. $str should contain regular characters and macros, escaped by the
% character. Supported macros are:
%f
- for script filename
%t
- for time (in format HH:MM)
- all macros as supported by the strftime() function
- -urls_file $file
- If you use this option, pavuk will read URLs from $file before it starts
processing. In this file, each URL needs to be on a separate line. After
the last URL, a single dot . followed by a LF (line-feed) character
denotes the end. Pavuk will start processing right after all URLs have
been read. If $file is given as the - character, standard
input will be read.
- -store_info/-nostore_info
- This option causes pavuk to store information about each document into a
separate file in the .pavuk_info directory. This file is used to
store the original URL from which the document was downloaded. For files
that are downloaded via HTTP or HTTPS protocols, the whole HTTP response
header is stored there. I recommend to use this option when you are using
options that change the default layout of the local document tree, because
this info file helps pavuk to map the local filename to the URL. This
option is also very useful when different URLs have the same filename in
the local tree. When this occurs, pavuk detects this using info files, and
it will prefix the local name with numbers. By default, storing of this
extra information is disabled.
- -info_dir $dir
- With this option you can set the location of a separate directory for storing
info files created when the -store_info option is used. This is useful
when you don't want to mix the info files with regular document files in
the destination directory. The structure of the info files is preserved; they
are just stored in a different directory.
- -request $req
- With this option you can specify extended information for starting URLs.
With this option you can specify query data for POST or GET
requests. Current syntax of this option is: URL:["]$url["]
[METHOD:["]{GET|POST}["]] [ENCODING:["]{u|m}["]]
[FIELD:["]variable=value["]]
[FILE:["]variable=filename["]]
[LNAME:["]local_filename["]]
- URL: specifies request URL
- METHOD: specifies request method for URL and is
one of GET or POST.
- ENCODING: specifies encoding for request body data.
m is for multipart/form-data encoding
u is for application/x-www-form-urlencoded
encoding
- FIELD: specifies field of request data in format
variable=value. For encoding of special characters
in variable and value you can use same encoding
as is used in application/x-www-form-urlencoded
encoding.
- FILE: specifies special field of query, which is
used to specify file for POST based file upload.
- LNAME: specifies localname for this request
When you need to use inside the FIELD: and FILE: fields of request
specification special characters, you should use the
application/x-www-form-urlencoded encoding of characters. It means all
nonASCII characters, quote character ("), space character ( ), ampersand
character (&), percent character (%) and equal character (=) should be
encoded in form %xx where xx is hexadecimal representation of
ASCII value of character. So for example % character should be encoded
like %25.
- -formdata $req
- This option gives you chance to specify contents for HTML forms found
during traversing document tree.
Current syntax of this option is same as for -request option, but
ENCODING: and METHOD: are meaningless in this option
semantics.
In URL: you have to specify HTML form action URL, which will be
matched against action URLs found in processed HTML documents. If pavuk
finds action URL which matches that supplied in -formdata option,
pavuk will construct GET or POST request from data supplied
in this option and from default form field values supplied in HTML
document. Values supplied on commandline have precedence before that
supplied in HTML file.
- -nthreads $nr
- By means of this option you can specify how many concurrent threads will
download documents. Default pavuk executes 3 concurrent downloading
threads. This option is available only when pavuk is compiled to support
multithreading.
- -immesg/-noimmesg
- Default pavuks behavior when running multiple downloading threads is to
buffer all output messages in memory buffer and flush that buffered data
just when thread finishes processing of one document. With this option you
can change this behavior to see the messages immediately when it is
produced. It is only usable when you want to debug some specials in
multithreading environment. This option is available only when pavuk is
compiled to support multithreading.
- -dumpfd $nr
- For scripting is sometimes usable to be able to download document directly
to pipe or variable instead of storing it to regular file. In such case
you can use this option to dump data for example to stdout ($nr = 1).
- -dump_after/-nodump_after
- While using -dumpfd option in multithreaded pavuk, it is required
to dump document in one moment because documents downloaded in multiple
threads can overlap. This option is also useful when you want to dump
document after pavuk adjusts links inside HTML documents.
- -dump_response/-nodump_response
- This option has an effect only when used with the -dumpfd option. It is
used to dump HTTP response headers.
- -dump_urlfd $nr
- When you will use this option, pavuk will output all URLs found in HTML
documents to file descriptor $nr. You can use this option to extract and
convert all URLs to absolute.
- -scenario $str
- Name of scenario to load and/or run. Scenarios are files with a structure
similar to the .pavukrc file. Scenarios contain saved
configurations. You can use it for periodical mirroring. Parameters from
scenarios specified at the command line can be overwritten by command line
parameters. To be able to use this option, you need to specify scenario
base directory with option -scndir.
- -dumpscn $filename
- Store actual configuration into scenario file with name $filename.
This is useful to quickly create pre-configured scenarios for manual
editing.
- -msgcat $dir
- Directory which contains the message catalog for pavuk. If you do not have
permission to store a pavuk message catalog in the system directory, you
should simply create similar structure of directories in your home
directory as it is on your system.
For example:
Your native language is German, and your home directory is
/home/jano.
You should at first create the directory
/home/jano/locales/de/LC_MESSAGES/, then put the German pavuk.mo there
and set -msgcat to /home/jano/locales/. If you have properly set locale
environment values, you will see pavuk speaking German. This option is
available only when you compiled in support for GNU gettext messages
internationalization.
- -cdir $dir
- Directory where all retrieved documents are stored. If not specified,
the current directory is used. If the specified directory doesn't exist,
it will be created.
- -scndir $dir
- Directory in which your scenarios are stored. You must use this option
when you are loading or storing scenario files.
- -preserve_time/-nopreserve_time
- Store downloaded document with same modification time as on the remote
site. Modification time will be set only when such information is
available (some FTP servers do not support the MDTM command, and
some documents on HTTP servers are created online so pavuk can't retrieve
the modification time of this document). At default modification time of
documents isn't preserved.
- -preserve_perm/-nopreserve_perm
- Store downloaded document with the same permissions as on the remote site.
This option has effect only when downloading a file through FTP protocol
and assumes that the -ftplist option is used. At default
permissions are not preserved.
- -preserve_slinks/-nopreserve_slinks
- Set symbolic links to point exactly to same location as on the remote
server; don't do any relocations. This option has effect only when
downloading file through FTP protocol and assumes that the -ftplist
option is used. Default symbolic links are not preserved, and are
retrieved as regular documents with full contents of linked file.
For example, assume that on the FTP server ftp.xx.org there is
a symbolic link /pub/pavuk/pavuk-current.tgz, which points to
/tmp/pub/pavuk-0.9pl11.tgz. Pavuk will create symbolic link
ftp/ftp.xx.org_21/pub/pavuk/pavuk-current.tgz
if option -preserve_slinks is used, this symbolic link will point to
/tmp/pub/pavuk-0.9pl11.tgz
if option -preserve_slinks is not used, this symbolic link will point to
../../tmp/pub/pavuk-0.9pl11.tgz
- -retrieve_symlink/-noretrieve_symlink
- Retrieve files behind symbolic links instead of replicating symlinks in
local tree.
- -http_proxy $site[:$port]
- If this parameter is used, then all HTTP requests are going through this
proxy server. This is useful if your site resides behind a firewall, or if
you want to use a HTTP proxy cache server. The default port number is
8080. Pavuk allows you to specify multiple HTTP proxies (using multiple
-http_proxy options) and it will rotate proxies with roundrobin priority
disabling proxies with errors.
- -nocache/-cache
- Use this option whenever you want to get the document directly from the
site and not from your HTTP proxy cache server. Default pavuk allows
transfer of document copies from cache.
- -ftp_proxy $site[:$port]
- If this parameter is used, then all FTP requests are going through this
proxy server. This is useful when your site resides behind a firewall, or
if you want to use FTP proxy cache server. The default port number is 22.
Pavuk supports three different types of proxies for FTP, see the options
-ftp_httpgw, -ftp_dirtyproxy. If none of the mentioned options is
used, then pavuk assumes a regular FTP proxy with USER user@host
connecting to remote FTP server.
- -ftp_httpgw/-noftp_httpgw
- The specified FTP proxy is a HTTP gateway for the FTP protocol. Default
FTP proxy is regular FTP proxy.
- -ftp_dirtyproxy/-noftp_dirtyproxy
- The specified FTP proxy is a HTTP proxy which supports a CONNECT
request (pavuk should use full FTP protocol, except for active data
connections). Default FTP proxy is regular FTP proxy. If both
-ftp_dirtyproxy and -ftp_httpgw are specified, -ftp_dirtyproxy is
preferred.
- -gopher_proxy $site[:$port]
- Gopher gateway or proxy/cache server.
- -gopher_httpgw/-nogopher_httpgw
- The specified Gopher proxy server is a HTTP gateway for Gopher protocol.
When -gopher_proxy is set and this -gopher_httpgw option
isn't used, pavuk is using proxy as HTTP tunnel with CONNECT
request to open connections to Gopher servers.
- -ssl_proxy $site[:$port]
- SSL proxy (tunneling) server [as that in CERN httpd + patch or in Squid]
with enabled CONNECT request (at least on port 443). This option is
available only when compiled with SSL support (you need the SSLeay or
OpenSSL libraries with development headers)
- -http_proxy_user $user
- Username for HTTP proxy authentification.
- -http_proxy_pass $pass
- Password for HTTP proxy authentification.
- -http_proxy_auth {1/2/3/4/user/Basic/Digest/NTLM}
- Authentification scheme for proxy access. Similar meaning as the
-auth_scheme option (see help for this option for more details).
Default is 2 (Basic scheme).
- -auth_proxy_ntlm_domain $str
- NT or LM domain used for authorization against the HTTP proxy server when NTLM
authentification scheme is required. This option is available only when
compiled with OpenSSL or libdes libraries.
- -auth_reuse_proxy_nonce/-noauth_reuse_proxy_nonce
- When using HTTP Proxy Digest access authentification scheme use first
received nonce value in multiple following requests.
- -ftp_proxy_user $user
- Username for FTP proxy authentification.
- -ftp_proxy_pass $pass
- Password for FTP proxy authentification.
- -ftp_passive
- Uses passive ftp when downloading via ftp.
- -ftp_active
- Uses active ftp when downloading via ftp.
- -active_ftp_port_range $min:$max
- This option permits to specify the ports used for active ftp. This permits
easier firewall configuration since the range of ports can be restricted.
Pavuk will randomly choose a number from within the specified
range until an open port is found. Should no open ports be found within
the given range, pavuk will default to a normal kernel-assigned port,
and a message (debug level net) is output.
The port range selected must be in the non-privileged range
(i.e. greater than or equal to 1024); it is STRONGLY RECOMMENDED that the
chosen range be large enough to handle many simultaneous active
connections (for example, 49152-65534, the IANA-registered ephemeral
port range).
- -always_mdtm/-noalways_mdtm
- Force pavuk to always use "MDTM" to determine the file
modification time and never use cached times determined when listing the
remote files.
- -remove_before_store/-noremove_before_store
- Force unlink'ing of files before new content is stored to a file. This is
helpful if the local files are hardlinked to some other directory and
after mirroring the hardlinks are checked. All "broken"
hardlinks indicate a file update.
- -retry $nr
- Set the number of attempts to transfer a processed document. The default is
1, which means pavuk will retry once to get documents which failed on the first
attempt.
- -nregets $nr
- Set the number of allowed regets on a single document, after a broken
transfer. Default value for this option is 2.
- -nredirs $nr
- Set number of allowed HTTP redirects. (use this for prevention of loops)
Default value for this option is 5, and conform to HTTP
specification.
- -force_reget/-noforce_reget
- Force reget'ing of the whole document after a broken transfer when the
server doesn't support retrieving of partial content. Pavuk default
behavior is to stop getting documents which don't allow restarting of
transfer from specified position.
- -timeout $nr
- Timeout for stalled connections in minutes. This value is also used for
connection timeouts. For sub-minute timeouts you can use floating point
numbers. Default timeout is 0, and that means timeout checking is
disabled.
- -noRobots/-Robots
- This switch suppresses the use of the robots.txt standard, which is
used to restrict access of Web robots to some locations on the web server.
Default is allowed checking of robots.txt files on HTTP servers. Enable
this option always when you are downloading huge sets of pages with
unpredictable layout. This prevents you from upsetting server
administrators :-).
- -noEnc/-Enc
- This switch suppresses using of gzip or compress or
deflate encoding in transfer. I don't know if some servers are
broken or what, but they are propagating that MIME type application/gzip
or application/compress as encoded. Turn this option off when you don't
have libz support compiled in and also lack the gzip program which is used
to decode documents encoded this way. By default, decoding of downloaded
documents is disabled.
- -check_size/-nocheck_size
- The option -nocheck_size should be used if you are trying to download
pages from a HTTP server which sends a wrong Content-Length: field
in the MIME header of response. Default pavuk behavior is to check this
field and complain when something is wrong.
- -maxrate $nr
- If you don't want to give all your transfer bandwidth to pavuk, use this
option to set pavuk's maximum transfer rate. This option accepts a
floating point number to specify the transfer rate in kB/s. If you want to
get optimal settings, you also have to play with the size of the read
buffer (option -bufsize) because pavuk is doing flow control only
at application level. By default pavuk uses the full bandwidth.
- -minrate $nr
- If you hate slow transfer rates, this option allows you to break transfers
with slow speed. You can set the minimum transfer rate, and if the
connection gets slower than the given rate, the transfer will be stopped.
The minimum transfer rate is given in kB/s. At default pavuk doesn't check
this limit.
- -bufsize $nr
- This option is used to specify the size of the read buffer (default size:
32kB). If you have a very fast connection, you may increase the size of
the buffer to get a better read performance. If you need to decrease the
transfer rate, you may need to decrease the size of the buffer and set the
maximum transfer rate with the -maxrate option. This option accepts
the size of the buffer in kB.
- -fs_quota $nr
- If you are running pavuk on a multiuser system, you may need to avoid
filling up your file system. This option lets you specify how much space
must remain free. If pavuk detects an underrun of the free space, it will
stop downloading files. Specify this quota in kB. Default value is 0, and
that means no checking of this quota.
- -file_quota $nr
- This option is useful when you want to limit downloading of big files, but
want to download at least $nr kilobytes from big files. A big file will be
transferred, and when it reaches the specified size, transfer will break.
Such document will be processed as properly downloaded, so be careful when
using this option. At default pavuk is transferring full size of
documents.
- -trans_quota $nr
- If you are aware that your selection should address a big amount of data,
you can use this option to limit the amount of transferred data. By
default the amount of transferred data is unlimited.
- -max_time $nr
- Set maximum amount of time for program run. After time is exceeded, pavuk
will stop downloading. Time is specified in minutes. Default value is 0,
and it means downloading time is not limited.
- -url_strategy $strategy
- This option allows you to specify a downloading order for URLs in document
tree. This option accepts the following strings as parameters :
level - will order URLs as it loads it from HTML
files (default)
leveli - as previous, but inline objects URLs come first
pre - will insert URLs from actual HTML document at start, before other
prei - as previous, but inline objects URLs come first
- -send_if_range/-nosend_if_range
- Send If-Range: header in HTTP request. I found out, that some HTTP
servers (greetings, MS :-)) are sending different ETag: fields in
different responses for the same, unchanged document. This causes problems
when pavuk attempts to reget a document from such a server: pavuk will
remember the old ETag value and use it in following requests for this
document. If the server checks it with the new ETag value and it differs,
it will refuse to send only part of the document, and start the download
from scratch.
- -ssl_version $v
- Set required SSL protocol version for SSL communication. $v is one
of ssl2, ssl23, ssl3 or tls1. This option is available only when compiled
with SSL support. Default is ssl23.
- -unique_sslid/-nounique_sslid
- This option can be used if you want to use a unique SSL ID for all
SSL sessions. Default pavuk behavior is to negotiate each time new session
ID for each connection. This option is available only when compiled with
SSL support.
- -use_http11/-nouse_http11
- This option is used to switch between HTTP/1.0 and HTTP/1.1 protocol used
with HTTP servers. Using the HTTP/1.1 protocol is currently not the default
because its implementation is very fresh and not 100% tested. Even so, using
HTTP/1.1 is highly recommended, because it is faster than HTTP/1.0 and uses
less network bandwidth for initiating connections. In a future version
I will activate HTTP/1.1 as the default.
- -local_ip $addr
- You can use this option when you want to use specified network interface
for communication with other hosts. This option is suitable for multihomed
hosts with several network interfaces. Address should be entered as
regular IP address or as host name.
- -identity $str
- This option allows you to specify content of User-Agent: field of
HTTP request. This is useful when scripts on a remote server return
different documents for the same URL to different browsers, or if some HTTP
server refuses to serve documents to Web robots like pavuk. By default pavuk
sends the string pavuk/$VERSION in the User-Agent: field.
- -auto_referer/-noauto_referer
- This option forces pavuk to send HTTP Referer: header field with
starting URLs. Content of this field will be self URL. Using this option
is required when the remote server checks the Referer: field. By default
pavuk won't send the Referer: field with starting URLs.
- -referer/-noreferer
- This option allows to enable and disable the transmission of HTTP
Referer: header field. At default pavuk sends Referer: field.
- -httpad $str
- In some cases you may want to add user defined fields to HTTP/HTTPS
requests. This option is exactly for this purpose. In $str you can
directly specify content of additional header. If you specify only raw
header, it will be used only for starting requests. When you want to use
this header with each request while crawling, prefix the header with
+ character.
- -del_after/-nodel_after
- This option allows you to delete FILES from REMOTE server, when download
is properly finished. By default this option is off.
- -FTPlist/-noFTPlist
- When option -FTPlist will be used, pavuk will retrieve content of FTP
directories with FTP command LIST instead of NLST. So the
same listing will be retrieved as with "ls -l" UNIX command.
This option is required if you need to preserve permissions of remote
files or you need to preserve symbolic links. Pavuk supports wide listing
on FTP servers with regular BSD or SYSV style "ls
-l" directory listing, on FTP servers with EPFL listing
format, VMS style listing, DOS/Windows style listing and
Novell listing format. Default pavuk behavior is to use NLST for FTP
directory listings.
- -ftp_list_options $str
- Some FTP servers require to supply extra options to LIST or NLST FTP
commands to show all files and directories properly. But be sure not to
use any extra options which can reformat output of the listing. Useful is
especially the -a option, which forces the FTP server to show also dot files
and directories and with broken WuFTP servers it also helps to produce
full directory listings not just files.
- -fix_wuftpd/-nofix_wuftpd
- This option is the result of several attempts to get the
-remove_old option working properly with WuFTPd server when the -FTPlist option
is used. The problem is that the FTP command LIST on WuFTPd doesn't mind when
trying to list nonexisting directory, and indicates success in FTP
response code. When you activate this option, pavuk uses extra FTP command
(STAT -d dir) to check whether the directory really exists. Don't use this
option until you are sure that you really need it!
- -auth_file $file
- File where you have stored authentification information for access to some
service. For file structure see below in FILES section.
- -auth_name $user
- If you are using this parameter, program is doing authentification with
each HTTP access to document. Use this only if you know that only one HTTP
server could be accessed or use -asite option to specify site to
which you use authentification. Else your auth parameters will be sent to
each accessed HTTP server.
- -auth_passwd $passwd
- Value of this parameter is used as password for authentification
- -auth_scheme {1/2/3/4/user/Basic/Digest/NTLM}
- This parameter specifies used authentification scheme.
1 or user means user authentification scheme is used as
defined in HTTP/1.0 or HTTP/1.1. Password and user name are sent
unencoded.
2 or Basic means Basic authentification scheme is used as
defined in HTTP/1.0. Password and user name are sent BASE64 encoded.
3 or Digest means Digest access authentification scheme based
on MD5 checksums as defined in RFC2069.
4 or NTLM means NTLM proprietary access authentification
scheme used by Microsoft IIS or Proxy servers. When you use this scheme,
you must also specify NT or LM domain with option
-auth_ntlm_domain. This scheme is supported only when compiled with
OpenSSL or libdes libraries.
- -auth_ntlm_domain $str
- NT or LM domain used for authorization against HTTP server when NTLM
authentification scheme is required. This option is available only when
compiled with OpenSSL or libdes libraries.
- -auth_reuse_nonce/-noauth_reuse_nonce
- While using HTTP Digest access authentification scheme use first received
nonce value in more following requests. Default pavuk negotiates nonce for
each request.
- -ssl_key_file $file
- File with public key for SSL certificate (learn more from SSLeay or
OpenSSL documentation). This option is available only when compiled with
SSL support (you need SSLeay or OpenSSL libraries and development
headers).
- -ssl_cert_file $file
- Certificate file in PEM format (learn more from SSLeay or OpenSSL
documentation). This option is available only when compiled with SSL
support (you need SSLeay or OpenSSL libraries and development
headers).
- -ssl_cert_passwd $str
- Password used to generate certificate (learn more from SSLeay or OpenSSL
documentation). This option is available only when compiled with SSL
support (you need SSLeay or OpenSSL libraries and development
headers).
- -nss_cert_dir $dir
- Config directory for NSS (Netscape SSL implementation) certificates.
Usually ~/.netscape (created by Netscape communicator/navigator) or
profile directory below ~/.mozilla (created by Mozilla browser). The
directory should contain cert7.db and key3.db files. If you
don't use Mozilla or Netscape, you must create these files with the utilities
distributed with NSS libraries. Pavuk opens certificate database only
readonly. This option is available only when pavuk is compiled with SSL
support provided by Netscape NSS SSL implementation.
- [-nss_accept_unknown_cert/-nonss_accept_unknown_cert]
- By default pavuk will reject connections to SSL servers whose certificate is
not stored in local certificate database (set by -nss_cert_dir
option). You must explicitly force pavuk to allow connection to servers
with unknown certificates. This option is available only when pavuk is
compiled with SSL support provided by Netscape NSS SSL
implementation.
- [-nss_domestic_policy/-nss_export_policy]
- Selects sets of ciphers allowed/disabled by USA export rules. This option
is available only when pavuk is compiled with SSL support provided by
Netscape NSS SSL implementation.
- -from $email
- This parameter is used when accessing anonymous FTP server as password or
is optionally inserted into From field in HTTP request. If not
specified pavuk discovers this from USER environment variable and
from site hostname.
- -send_from/-nosend_from
- This option is used for enabling or disabling sending of user
identification, entered in -from option, as FTP anonymous user
password and From: field of HTTP request. By default this option is
off.
- -ftp_login_handshake $host $handshake
- When you need to use nonstandard login procedure for some of FTP servers,
you can use this option to change default pavuk login procedure. To allow
more flexibility, you can assign the login procedure to some server or to
all. When $host is specified as empty string (""),
then the attached login procedure is assigned to all FTP servers besides those
having assigned own login procedures. In the $handshake parameter
you can specify exact login procedure specified by FTP commands followed
by expected FTP response codes delimited with backslash (\)
characters.
For example this is default login procedure when logging in regular ftp
server without going through proxy server : USER %u\331\PASS
%p\230. There are two commands followed by two response codes. After
USER command pavuk expects FTP response code 331 and after PASS command
pavuk expects from server FTP response code 230. In ftp commands you can
use following macros which will be replaced by respective values:
%u - user name used to access FTP server
%p - password used to access FTP server
%U - user name used to access FTP proxy server
%P - password used to access FTP proxy server
%h - hostname of FTP server
%s - port number on which FTP server listens
- -asite $list
- Specify comma separated list of allowed sites on which referenced
documents are stored.
- -dsite $list
- Specify comma separated list of disallowed sites. Previous parameter is
opposite to this one. If both are used the last occurrence of them is used
to be valid.
- -adomain $list
- Specify comma separated list of allowed domains on which referenced
documents are stored.
- -ddomain $list
- Specify comma separated list of disallowed domains. Previous parameter is
opposite to this one. If both are used the last occurrence of them is used
to be valid.
- -aport $list
- In $list, you can write comma separated list of ports from which
you allow to download documents.
- -dport $list
- This option is opposite option to previous option. It is used to specify
denied ports. If both -aport and -dport options are used the
last occurrence of them is used to be valid and all other occurrences will
be omitted.
- -amimet $list
- List of comma separated allowed MIME types. You can use with this option
also wildcard patterns.
- -dmimet $list
- List of comma separated disallowed MIME types. You can use with this
option also wildcard patterns. Previous parameter is opposite to this one.
If both are used the last occurrence of them is used to be valid.
- -maxsize $nr
- Maximum allowed size of document. This option is applied only when pavuk
is able to detect the size of the document before starting the transfer. Default value
is 0, and it means this limit isn't applied.
- -minsize $nr
- Minimal allowed size of document. This option is applied only when pavuk
is able to detect the size of the document before starting the transfer. Default value
is 0, and it means this limit isn't applied.
- -newer_than $time
- Allow only transfer of documents with modification time newer than
specified in parameter $time. Format of $time is: YYYY.MM.DD.hh:mm. To
apply this option pavuk must be able to detect modification time of
document.
- -older_than $time
- Allow only transfer of documents with modification time older than
specified in parameter $time. Format of $time is: YYYY.MM.DD.hh:mm. To
apply this option pavuk must be able to detect modification time of
document.
- -noCGI/-CGI
- This switch prevents the transfer of dynamically generated parametric
documents through CGI interface. This is detected with occurrence of
? character inside URL. Default pavuk behavior is to allow transfer
of URLs with query strings.
- -alang $list
- This allows you to specify an ordered comma separated list of preferred
natural languages. This option works only with HTTP and HTTPS protocol
using Accept-Language: MIME field.
- -acharset $list
- This option allows you to enter a comma separated list of preferred
encodings of transferred documents. This works only with HTTP and HTTPS URLs
and only if such document encodings are located on destination server.
example: -acharset iso-8859-2,windows-1250,utf8
- -asfx $list
- This parameter allows you to specify set of suffixes used to restrict
selection of documents which will be processed.
- -dsfx $list
- Set of suffixes that are used to specify restriction on selection of
documents. This one is inverse to previous option. They are segregating
each other.
- -aprefix $list, -dprefix $list
- These two options allow you to specify set of allowed or disallowed
prefixes of documents. They are segregating each other.
- -pattern $pattern
- This option allows you to specify wildcard pattern for documents. All
documents are tested if they match this pattern.
- -rpattern $reg_exp
- This is equal option as previous, but this uses regular expressions.
Available only on platforms which have any supported RE
implementation.
- -skip_pattern $pattern
- This option allows you to specify wildcard pattern for documents that
should be skipped. All documents are tested if they match this
pattern.
- -skip_rpattern $reg_exp
- This is equal option as previous, but this uses regular expressions.
Available only on platforms which have any supported RE
implementation.
- -url_pattern $pattern
- This option allows you to specify wildcard pattern for URLs. All URLs are
tested if they match this pattern.
Example:
-url_pattern http://\*.idata.sk:\*/~ondrej/\* . this option enables all HTTP
URLs from domain .idata.sk on all ports which are located under
/~ondrej/.
- -url_rpattern $reg_exp
- This is equal option as previous, but this uses regular expressions.
Available only on platforms which have any supported RE
implementation.
- -skip_url_pattern $pattern
- This option allows you to specify wildcard pattern for URLs that should be
skipped. All URLs are tested if they match this pattern.
- -skip_url_rpattern $reg_exp
- This is equal option as previous, but this uses regular expressions.
Available only on platforms which have any supported RE
implementation.
- -aip_pattern $re
- This option allows you to limit set of transferred documents by server IP
address. IP address can be specified as regular expressions, so it is
possible to specify set of IP addresses by one expression. Available only
on platforms which have any supported RE implementation.
- -dip_pattern $re
- This option similar to previous option, but is used to specify set of
disallowed IP addresses. Available only on platforms which have any
supported RE implementation.
- -tag_pattern $tag $attrib $url
- More powerful version of -url_pattern option for more precise
matching of allowed URLs based on HTML tag name pattern, HTML tag
attribute name pattern and on URL pattern. You can use in all three
parameters of this option wildcard patterns, thus something like
-tag_pattern '*' '*' url_pattern is equal to -url_pattern
url_pattern. The $tag and $attrib parameters are always
matched against uppercase strings. For example if you want just let pavuk
follow only regular links ignoring any stylesheets, images, etc., use
option -tag_pattern A HREF '*'.
- -tag_rpattern $tag $attrib $url
- This is variation on the -tag_pattern. It uses regular expression
patterns in parameters instead of wildcard patterns used in the previous
option.
- -noHTTP/-HTTP
- This switch suppresses all transfers through HTTP protocol. By default,
transfer through HTTP is enabled.
- -noSSL/-SSL
- This switch suppresses all transfers through HTTPS protocol (HTTP protocol
over SSL). By default, transfer through HTTPS is enabled. This option is
available only when compiled with SSL support (you need SSLeay or OpenSSL
libraries and development headers).
- -noGopher/-Gopher
- Suppress all transfers through Gopher Internet protocol. By default,
transfer through Gopher is enabled.
- -noFTP/-FTP
- This switch prevents processing documents located on all FTP servers.
By default, transfer through FTP is enabled.
- -noFTPS/-FTPS
- This switch prevents processing documents located on all FTP servers
accessed through SSL. By default, transfer through FTPS is enabled. This option
is available only when compiled with SSL support (you need SSLeay or
OpenSSL libraries and development headers).
- -FTPhtml/-noFTPhtml
- By using of option -FTPhtml you can force pavuk to process HTML files
downloaded with FTP protocol. At default pavuk won't parse HTML files from
FTP servers.
- -FTPdir/-noFTPdir
- Force recursive processing of FTP directories too. By default recursive
downloading from FTP servers is denied.
- -disable_html_tag $TAG,[$ATTRIB][;...]
- -enable_html_tag $TAG,[$ATTRIB][;...] Enable or disable processing
of particular HTML tags or attributes. At default all supported HTML tags
are enabled.
For example if you don't want to process all images you should
use option -disable_html_tag 'IMG,SRC;INPUT,SRC;BODY,BACKGROUND'
.
- -subdir $dir
- Subdirectory of local tree directory, to limit some of the modes {sync,
resumeregets, linkupdate} in its tree scan.
- -dont_leave_site/-leave_site
- (Don't) leave starting site. At default pavuk can span host when recursing
through WWW tree.
- -dont_leave_dir/-leave_dir
- (Don't) leave starting directory. If -dont_leave_dir option is used pavuk
will stay only in starting directory (including its own subdirectories).
At default pavuk can leave starting directories.
- -leave_site_enter_dir/-dont_leave_site_enter_dir
- If you are downloading WWW tree which spans multiple hosts with huge
trees, you may want to allow downloading of documents which are in
directory hierarchy below directory which we visited as first on each
site. To obtain this, use option -dont_leave_site_enter_dir. As default
pavuk will go also to higher directory levels on that site.
- -lmax $nr
- Set maximum allowed level of tree traverse. Default is set to 0, which
means that pavuk can traverse ad infinitum. As of version 0.8pl1 inline
objects of HTML pages are placed at same level as parent HTML page.
- -leave_level $nr
- Maximum level of documents outside from site of starting URL. Default is
set to 0, and 0 means that checking is not applied.
- -site_level $nr
- Maximum level of sites outside from site of starting URL. Default is set
to 0, and 0 means that checking is not applied.
- -dmax $nr
- Set maximum allowed number of documents that are processed. Default value
is 0. That means no restrictions are used in number of processed
documents.
- -singlepage/-nosinglepage
- Using option -singlepage allows you to transfer just HTML pages
with all its inlined objects (pictures, sounds, frame documents, ...). As
default is disabled single page transfer. This option makes -mode
singlepage option obsolete.
- -limit_inlines/-dont_limit_inlines
- With this option you can control whether limiting options apply also to
inline objects (pictures, sounds, ...). This is useful when you want to
download specified set of HTML pages with all inline objects without any
restrictions.
- -user_condition $str
- Script or program name for users own conditions. You can write any script
which should with exit value decide if download URL or not. Script gets
from pavuk any number of options, with this meaning :
-url $url - processed URL
-parent $url - any number of parent URLs
-level $nr - level of this URL from starting URL
-size $nr - size of requested URL
-date $datenr - modification time of requested URL in format
YYYYMMDDhhmmss
The exit status 0 of script or program means that current URL
should be rejected and nonzero exit status means that URL should be
accepted.
Warning : use user conditions only if required because of big
slowdowns caused by forking scripts for each checked URL.
- -follow_cmd $str
- This option allows you to specify script or program which can by its exit
status decide whether to follow URLs from current HTML document. This
script will be called after download of each HTML document. The script
will get the following options as its parameters:
-url $url - URL of current HTML document
-infile $file - local file where is stored HTML document
The exit status 0 of script or program means that URLs from
current document will be disallowed, other exit status means, that pavuk
can follow links from current HTML document.
Support for scripting languages like JavaScript or VBScript in pavuk is done in a
somewhat hacky way. There is no interpreter for these languages, so not all things
will work. All the support pavuk has for these scripting languages is based on
regular expression patterns specified by the user. Pavuk searches for these patterns
in DOM event attributes of HTML tags, in javascript:... URLs, in inline
scripts in HTML documents enclosed between <script></script> tags
and in separate javascript files. Support for scripting languages is only
available when pavuk is compiled with proper regular expression library
(POSIX/GNU/PCRE).
- -enable_js/-disable_js
- These options are used to enable or disable processing of Javascript parts
of HTML documents. You must enable this option to be able to use
processing of javascript patterns.
- -js_pattern $re
- With this option you specify which patterns match the parts of Javascript
that are of interest for extracting URLs. The parameter must be an RE pattern with
exactly one subpattern which matches exactly the URL part. For example to
match URL in following type of javascript expressions :
document.b1.src='pics/button1_pre.jpg'
you can use this pattern
"^document.[a-zA-Z0-9_]*.src[ ]*=[ ]*'(.*)'$"
- -js_transform $p $t $h $a
- This option is similar to previous, but you can use custom transform rules
for the URL parts of patterns and also specify the exact HTML tag and
attribute where to look for this pattern. The $p is the pattern to
match the interested part of script. The $t is transform rule for
the URL, in this parameter the $x parts will be replaced by x-th
subpattern of the $p pattern. The $h parameter is exact HTML
tag or "*" when this apply to javascript: URLs or DOM event
attribs or "" (empty string) when this apply to javascript body
of HTML document or separate JS file. The $a parameter is exact
HTML attrib of tag or "" (empty string) when this rule apply to
javascript body.
- -js_transform2 $p $t $h $a
- This option is very similar to previous. The meaning of all parameters is
same, just the pattern $p can have only one substring which will be
used in the transform rule $t. This is required to allow rewriting
of URL parts of the tags and scripts. This option can also be used to
force pavuk to recognize HTML tag/attribute pairs which pavuk does not
support.
- -cookie_file $file
- File where cookie information is stored. This file must be in Netscape cookie
file format (generated with Netscape Navigator or Communicator ...).
- -cookie_send/-nocookie_send
- Use collected cookies in HTTP/HTTPS requests. Pavuk will not send at
default cookies.
- -cookie_recv/-nocookie_recv
- Store received cookies from HTTP/HTTPS responses into memory cookie cache.
At default pavuk will not remember received cookies.
- -cookie_update/-nocookie_update
- Update cookie file on disk and synchronize it with changes made by any
concurrent processes. At default pavuk will not update cookie file on
disk.
- -cookies_max $nr
- Maximum number of cookies in memory cookie cache. Default value is 0, and
that means no restrictions for cookies number.
- -disabled_cookie_domains $list
- Comma-separated list of cookie domains which are not permitted to send cookies
stored into cookie cache.
- -cookie_check/-nocookie_check
- Check when receiving cookie, if cookie domain is equal to domain of server
which sends this cookie. By default pavuk checks if the server is setting
cookies for its domain, and if it tries to set cookie for foreign domain
pavuk will complain about that and will reject such cookie.
- -noRelocate/-Relocate
- This switch prevents the program to rewrite relative URLs to absolute,
after HTML document is transferred. Default pavuk behavior is to maintain
link consistence of HTML documents. So always when HTML document is
downloaded pavuk will rewrite all URLs to point to local document if it is
available and if it is not available it will point to remote document.
After document is properly downloaded, pavuk will update links in HTML
documents, which point to this one.
- -all_to_local/-noall_to_local
- This option forces pavuk to change all URLs inside HTML document to local
URLs immediately after download of document. Default is this option
disabled.
- -sel_to_local/-nosel_to_local
- This option forces pavuk to change all URLs, which accomplish conditions
for download, to local inside HTML document immediately after download of
document. I recommend to use this option, when you are sure, that transfer
will be without any problems. This option can save a lot of processor
time. Default is this option disabled.
- -all_to_remote/-noall_to_remote
- This option forces pavuk to change all URLs inside HTML document to remote
URLs immediately after download of document. Default is this option
disabled.
- -post_update/-nopost_update
- This option is especially designed to allow in -fnrules option
doing rules based on MIME type of document. This option forces pavuk to
generate local names for documents just after pavuk knows what is the MIME
type of document. This has a big impact on the rewriting engine of links
inside HTML documents. This option causes dysfunction of other options for
controlling the link rewriting engine. Use this option only when you know
what you are doing :-)
- -dont_touch_url_pattern $pat
- This option serves to deny rewriting and processing of particular URLs in
HTML documents by pavuk HTML rewriting engine. This option accepts
wildcard patterns to specify such URLs. Matching is done against untouched
URLs, so when the URL is relative, you must use a pattern which matches the
relative URL, and when it is absolute, you must use a pattern which matches the absolute URL.
- -dont_touch_url_rpattern $pat
- This option is variation on previous option. This one uses regular
patterns for matching of URLs instead of wildcard patterns used by
-dont_touch_url_pattern option. This option is available only when
pavuk is compiled with support for regular expression patterns.
- -dont_touch_tag_rpattern $pat
- This option is variation on previous option, just matching is made on full
HTML tag with included <>. This option accepts regular expression
patterns. It is available only when pavuk is compiled with support for
regular expression patterns.
- -tr_del_chr $str
- All characters found in $str will be deleted from local name of
document. $str may contain escape sequences similar to those of the tr
command:
\n - newline
\r - carriage return
\t - horizontal tab space
\0xXX - hexadecimal ASCII value
[:upper:] - all uppercase letters
[:lower:] - all lowercase letters
[:alpha:] - all letters
[:alnum:] - all letters and digits
[:digit:] - all digits
[:xdigit:] - all hexadecimal digits
[:space:] - all horizontal and vertical whitespace
[:blank:] - all horizontal whitespace
[:cntrl:] - all control characters
[:print:] - all printable characters including space
[:nprint:] - all non printable characters
[:punct:] - all punctuation characters
[:graph:] - all printable characters excluding space
- -tr_str_str $str1 $str2
- String $str1 from local name of document will be replaced with
$str2.
- -tr_chr_chr $chrset1 $chrset2
- Characters from $chrset1 from local name of document will be
replaced with corresponding character from $chrset2.
$chrset1 and $chrset2 should have same syntax as
$str in -tr_del_chr option.
- -store_name $str
- When you want to change local filename of first file downloaded with
singlepage mode, you should use this option.
- -index_name $str
- With this option you can change directory index name. As default is used
_._.html .
- -store_index/-nostore_index
- With option -nostore_index you can deny storing of directory indexes
into HTML files.
- -fnrules $t $m $r
- This is a very powerful option! This option is used to flexibly change
the layout of the local document tree. It accepts three parameters. First
parameter $t is used to say what type is following pattern.
F is used for wildcard pattern (uses fnmatch()) and R
is used for regular expression pattern (using any supported RE
implementation). Second parameter is matching pattern used to select URLs
for this rule. If URL match this pattern, then local name for this URL is
computed following rules of third parameter. And third parameter is local
name building rule. Pavuk now supports two kinds of local name building
rules. One is simple based only on simple macros and other more
complicated extended rule, which also enables to perform several
functions. Recognition between those two kinds of rules is done by looking
at first character of rule. In case when first character is '(',
rule is extended and in all other cases it is the simple kind of rule.
Simple rule should contain literals or escaped macros.
Macros are escaped by % character or by $ character.
Here is list of recognized macros:
$x - where x is any positive number. This macro is
replaced with x-th substring matched by RE pattern. (If you use this you
need to understand RE !)
%i - is replaced with protocol id (http, https, ftp, gopher)
%p - is replaced with password. (use this only when usable)
%u - is replaced with username.
%h - is replaced with host name.
%m - is replaced with domain name.
%r - is replaced with port number.
%d - is replaced with path to document.
%n - is replaced with document name.
%b - is replaced with basename of document (without extension).
%e - is replaced with extension.
%s - is replaced with searchstring.
%M - is replaced with MIME type of document. When you are using
this macro, you *must* use also -post_update option else it won't
work.
%E - is replaced with default extension assigned to MIME type of
document. When you are using this macro, you *must* use also
-post_update option else it won't work.
%x - where x is positive number. This macro is replaced with x-th
directory from path to document from beginning.
%-x - where x is positive number. This macro is replaced with x-th
directory from path to document from end.
Here is example. If you want place document into single
directories by extension, you should use following fnrules option:
-fnrules F '*' '/%e/%n'
Extended rule always begins with character
'('. It uses some kind of LISP like syntax.
Here are base rules for writing extended rules : - the
local filename for a rule of this kind is the return value of the function
- each function is enclosed inside round braces ()
- first token right after opening brace is function name
- each function have nonzero fixed number of parameters
- each function returns numeric or string value
- function parameters are separated by any number of space characters
- parameter of function should be string, number, macro or other function
- string is always quoted with "
- each numeric parameter can be in any encoding supported by
strtod() function (octal, decimal, hexadecimal, ...)
- there is no implicit conversion from number to string
- each macro is prefixed by % character and is one character long
- each macro is replaced by its string representation from current URL
- function parameters are typed strictly
- toplevel function must return string value
Extended rule supports full set of % escaped macros
supported with simple rules, plus two following addition macros :
%U - URL string
%o - default localname for URL
Here is description of all supported functions
sc - concat two string parameters
- accepts two string parameters
- returns string value
ss - substring from string
- accepts three parameters.
- first is string from which we want to cut subpart
- second is number which represents starting position in string
- third is number which represents ending position in string
- returns string value
hsh - compute modulo hash value from string with specified base
- accepts two parameters
- first is string for which we are computing the hash value
- second is numeric value for base of modulo hash
- returns numeric value
md5 - compute MD5 checksum for string
- accepts one string value
- returns string which represents MD5 checksum
lo - convert all characters inside string to lower case
- accepts one string value
- returns string value
up - convert all characters inside string to upper case
- accepts one string value
- returns string value
ue - encode unsafe characters in string with same encoding which is
used for encoding unsafe characters inside URL (%xx) As default
are encoded all nonascii values when this function is used.
- accepts two string values
- first is string which we want to encode
- second is string which contains unsafe characters
- return string value
dc - delete unwanted characters from string (have similar
functionality as -tr_del_chr option)
- accepts two string values
- first is string from which we want delete
- second is string which contains characters we want to delete.
- returns string value
tc - replace character with other character in string (have similar
functionality as -tr_chr_chr option)
- accepts three string values
- first is string inside which we want to replace characters
- second is set of characters which we want to replace
- third is set of characters with which we are replacing
- returns string value
ts - replace some string inside string with any other string (have
similar functionality as -tr_str_str option)
- accepts three string values
- first is string inside which we want to replace string
- second is the from string
- third is to string
- returns string value
spn - calculate initial length of string which contains only
specified set of characters. (have same functionality as strspn()
libc function)
- accepts two string values
- first is input string
- second is set of acceptable characters
- returns numeric value
cspn - calculate initial length of string which doesn't contain
specified set of characters. (have same functionality as
strcspn() libc function)
- accepts two string values
- first is input string
- second is set of unacceptable characters
- returns numeric value
sl - calculate length of string
- accepts one string value
- returns numeric value
ns - convert number to string by format
- accepts two parameters
- first parameter is format string same as for printf() function
- second is number which we want to convert
- returns string value
lc - return position of last occurrence of specified character
inside string
- accepts two string parameters
- first string which we are searching in
- second string contains character for which we are looking for
- returns numeric value
+ - add two numeric values
- accepts two numeric values
- returns numeric value
- - subtract two numeric values
- accepts two numeric values
- returns numeric value
% - modulo addition
- accepts two numeric values
- returns numeric value
* - multiply two numeric values
- accepts two numeric values
- returns numeric value
/ - divide two numeric values
- accepts two numeric values
- returns numeric value
rmpar - remove parameter from query string
- accepts two strings
- first string is string which we are adjusting
- second parameter is name of parameter which should be removed
- returns adjusted string
getval - get query string parameter value
- accepts two strings
- first string is query string from which to get the parameter
value (usually %s)
- second string is name of parameter for which we want to get
the value
- returns value of the parameter or empty string when the parameter
doesn't exist
sif - logical decision
- accepts three parameters
- first is numeric and when it is zero then the result of this decision
is the result of the second parameter, else the result is the result of the third
parameter
- second parameter is string
- third parameter is string
- returns string result of decision
! - logical not
- accepts one numeric parameter
- returns negation of parameter
& - logical and
- accept two numeric parameters
- returns logical and of parameters
| - logical or
- accept two numeric parameters
- returns logical or of parameters
getext - get file extension
- accepts one string (filename or path)
- return string containing extension of parameter
seq - compare two strings
- accepts two strings for comparison
- returns numeric value 0 - if different 1 - if equal
jsf - execute JavaScript function
- accepts one string parameter which holds name of
JavaScript function specified in script loaded with
-js_script_file option.
- returns string value equal to return value of
JavaScript function
- this function is available only when pavuk is compiled
with support for JavaScript bindings
For example, if you are mirroring a very large number of internet
sites into the same local directory, too many entries in one directory
can cause performance problems. You may use for example hsh or
md5 functions to generate one additional level of hash
directories based on hostname with one of following options :
-fnrules F '*' '(sc (ns "%02d/" (hsh %h 100))
%o)'
-fnrules F '*' '(sc (ss (md5 %h) 0 2) %o)'
- -base_level $nr
- Number of directory levels to omit in local tree.
For example when downloading URL
ftp://ftp.idata.sk/pub/unix/www/pavuk-0.7pl1.tgz you enter at command
line -base_level 4 in local tree will be created
www/pavuk-0.7pl1.tgz not
ftp/ftp.idata.sk_21/pub/unix/www/pavuk-0.7pl1.tgz as normally.
- -default_prefix $str
- Default prefix of mirrored directory. This option is used only when you
are trying to synchronize content of remote directory which was downloaded
using -base_level option. Also you must use directory based
synchronization method, not URL based synchronization method. This is
especially useful, when used in conjunction with -remove_old
option.
- -remove_adv/-noremove_adv
- This option is used for turn on/off of removing HTML tags which contains
advertisement banners. The banners are not removed from HTML file, but are
commented out. Such URLs also will not be downloaded. This option have
effect only when used with option -adv_re. Default is turned off.
This option is available only when your system have support for one of
supported regular expressions implementation.
- -adv_re $RE
- This option is used to specify regular expressions for matching URLs of
advertisement banners. For example : -adv_re http://ad.doubleclick.net/.*
is used to match all files from server ad.doubleclick.net. This option is
available only when your system have any supported regular expressions
implementation.
- -unique_name/-nounique_name
- Pavuk as default always attempts to assign to unique URL unique local
filename. If this behavior is not wanted, you can use option
-nounique_name to disable this.
- -sleep $nr
- This option allows you to specify number of seconds during that the
program will be suspended between two transfers. Useful to deny server
overload. Default value for this option is 0.
- -rsleep/-norsleep
- When this option is active, pavuk randomizes the sleep time between
transfers in interval between zero and value specified with -sleep
option. Default is this option inactive.
- -ddays $nr
- If document has modification time later as $nr days, then in sync mode
pavuk attempts to retrieve newer copy of document from remote server.
Default value is 0.
- -remove_old/-noremove_old
- Remove improper documents (those which don't exist on the remote site). This
option has effect only when used in directory based sync mode.
When used with URL based sync mode, pavuk will not remove any old files
which were excluded from document tree and are not referenced in any HTML
document. You must also use option -subdir, to let pavuk find files
which belongs to current mirror. As default pavuk won't remove any old
files.
- -browser $str
- is used to set your browser command (in URL tree dialog you can use right
click to raise menu, from which you can start browser on actually selected
URL). This option is available only when compiled with GTK GUI and with
support for URL tree preview.
- -debug/-nodebug
- turns on displaying of debug messages. This option is available only when
compiled with -DDEBUG. If -debug option is used pavuk will output verbose
information about documents, whole protocol level information, locking
information and more (depends on -debug_level setup). This option
is used just as a trigger to enable output of debug messages selected by
-debug_level option. Default is debug mode turned off.
- -debug_level $level
- Set level of required debug informations. $level can be numeric
value which represent binary mask for requested debug levels, or comma
separated list of supported debug levels. Currently pavuk supports
following debug levels :
html - for HTML parser debugging
protos - to see server side protocol messages
protoc - to see client side protocol messages
procs - to see some special procedure calls
locks - for debugging of documents locking
net - for debugging some low level network stuff
misc - for miscellaneous unsorted debug messages
user - for verbose user level messages
all - request all currently supported debug levels
mtlock - locking of resources in multithreading environment
mtthr - launching/waking/sleeping/stopping of threads in
multithreaded environment
protod - for DEBUGGING of POST requests
limits - for debugging limiting options, you will see the reason why
particular URLs are rejected by pavuk and which option caused this.
ssl - to enable verbose reporting about SSL related things.
- -remind_cmd $str
- This option have effect only when running pavuk in reminder mode.
To command specified with this option pavuk sends result of running
reminder mode. There are listed URLs which are changed and URLs which have
any errors. Default remind command is "mailx user@server -s
\"pavuk reminder result\"" .
- -nscache_dir $dir
- Path to Netscape browser cache directory. If you specify this path, pavuk
attempts to find out if you have URL in this cache. If URL is there it
will be fetched else pavuk will download it from net. The cache directory
index file must be named index.db and must be located in the cache
directory. To support this feature, pavuk have to be linked with
BerkeleyDB 1.8x .
- -mozcache_dir $dir
- Path to Mozilla browser cache directory. Same functionality as with
previous option, just for different browser with different cache formats.
Pavuk supports both formats of Mozilla browser disk cache (old for
versions <0.9 and new used in versions >=0.9). The old format cache directory
must contain cache directory index database with name cache.db.
The new format cache directory must contain map file _CACHE_MAP_,
and three block files _CACHE_001_, _CACHE_002_,
_CACHE_003_. To support old Mozilla cache format, pavuk have to be
linked with BerkeleyDB 1.8x. New Mozilla cache format doesn't require any
external library.
- -post_cmd $str
- Post-processing command, which will be executed after successful download
of document. This command may somehow handle with document. During time of
running this command, pavuk leaves actual document locked, so there isn't
chance that some other pavuk process will modify document. This
postprocessing command will get three additional parameters from pavuk.
- local name of document
- 1/0 1 if document is HTML document, 0 if not
- original URL of this document
- -hack_add_index/-nohack_add_index
- This is a somewhat hacky option. It forces pavuk to also add to the URL queue
the directory indexes of all queued documents. This allows pavuk to download
more documents from a site than it is able to achieve in normal traversing of
HTML documents. A bit dirty but useful in some cases.
- -js_script_file $file
- Pavuk have optionally builtin JavaScript interpreter to allow high level
customization of some internal procedures. Currently you are allowed to
customize with your own JavaScript functions two things. You can use it to
set precise limiting options, or you can write own functions which can be
used inside rules of -fnrules option. With this option you can load
JavaScript script with functions into pavuk's internal JavaScript
interpreter. To learn more about these capabilities read separate document
jsbind.txt which comes with pavuk sources in toplevel directory. This
option is available only when you have compiled pavuk with support for
JavaScript bindings.
As of version 0.9pl29 pavuk has changed indication of status by exit codes. In
earlier versions exit status 0 was for no error and nonzero exit status was
something like count of failed documents. In all versions after 0.9pl29 the
following exit codes are defined:
0 - no error, everything is OK
1 - error in configuration of pavuk options or
error in config files
2 - some error occurred while downloading documents
- USER
- variable is used to construct email address from user and hostname
- LC_* or LANG
- used to set internationalized environment
- PAVUKRC_FILE
- with this variable you can specify alternative location for your pavukrc
configuration file.
- at
- is used for scheduling.
- gunzip
- is used to decode gzip or compress encoded documents.
If you find any bugs, please let me know.
- @SYSCONFDIR@/pavukrc
- ~/.pavukrc
- ~/.pavuk_prefs
-
These files are used as default configuration files. You may
specify there some constant values like your proxy server or your
preferred WWW browser. Configuration options reflect command line
options. Not all parameters are suitable for use in default
configuration file. You should select only some of them, which you
really need.
File ~/.pavuk_prefs is special file which contains
automatically stored configuration. This file is used only when running
GUI interface of pavuk and option -prefs is active.
First (if present) parsed file is @SYSCONFDIR@/pavukrc
then ~/.pavukrc (if present), then ~/.pavuk_prefs (if
present). Last the command line is parsed. The precedence is as follows
:
- highest -
Entered in user interface
Entered in command line
~/.pavuk_prefs
~/.pavukrc
@SYSCONFDIR@/pavukrc
- lowest -
Here is table of config file - command line options pairs.
MaxLevel: ---> -lmax
MaxDocs: ---> -dmax
MaxSize: ---> -maxsize
MinSize: ---> -minsize
SleepBetween: ---> -sleep
MaxRetry: ---> -retry
MaxRegets: ---> -nregets
MaxRedirections: ---> -nredirs
CommTimeout: ---> -timeout
RegetRollbackAmount: ---> -rollback
DocExpiration: ---> -ddays
UseCache: ---> -nocache
UseRobots: ---> -noRobots
AllowFTP: ---> -noFTP
AllowHTTP: ---> -noHTTP
AllowSSL: ---> -noSSL
AllowGopher: ---> -noGopher
AllowCGI: ---> -noCGI
AllowGZEncoding: ---> -noEnc
AllowFTPRecursion: ---> -FTPdir
ForceReget: ---> -force_reget
Debug: ---> -debug
AllowedSites: ---> -asite
DisallowedSites: ---> -dsite
AllowedDomains: ---> -adomain
DisallowedDomains: ---> -ddomain
AllowedPrefixes: ---> -aprefix
DisallowedPrefixes: ---> -dprefix
AllowedSuffixes: ---> -asfx
DisallowedSuffixes: ---> -dsfx
AllowedMIMETypes: ---> -amimet
DisallowedMIMETypes: ---> -dmimet
PreferredLanguages: ---> -alang
PreferredCharset: ---> -acharset
WorkingDir: ---> -cdir
WorkingSubDir: ---> -subdir
HTTPAuthorizationScheme: ---> -auth_scheme
HTTPAuthorizationName: ---> -auth_name
HTTPAuthorizationPassword: ---> -auth_passwd
AuthReuseDigestNonce: ---> -auth_reuse_nonce
SSLCertPassword: ---> -ssl_cert_passwd
SSLCertFile: ---> -ssl_cert_file
SSLKeyFile: ---> -ssl_key_file
EmailAddress: ---> -from
MatchPattern: ---> -pattern
REMatchPattern: ---> -rpattern
SkipMatchPattern: ---> -skip_pattern
SkipREMatchPattern: ---> -skip_rpattern
URLMatchPattern: ---> -url_pattern
URLREMatchPattern: ---> -url_rpattern
SkipURLMatchPattern: ---> -skip_url_pattern
SkipURLREMatchPattern: ---> -skip_url_rpattern
DefaultMode: ---> -mode
FTPProxy: ---> -ftp_proxy
HTTPProxy: ---> -http_proxy
SSLProxy: ---> -ssl_proxy
GopherProxy: ---> -gopher_proxy
FTPViaHTTPProxy: ---> -ftp_httpgw
GopherViaHTTPProxy: ---> -gopher_httpgw
HTTPProxyUser: ---> -http_proxy_user
HTTPProxyPass: ---> -http_proxy_pass
HTTPProxyAuth: ---> -http_proxy_auth
AuthReuseProxyDigestNonce: ---> -auth_reuse_proxy_nonce
Browser: ---> -browser
ScenarioDir: ---> -scndir
ShowProgress: ---> -progress
XMaxLogSize: ---> -xmaxlog
LogFile: ---> -logfile
RemoveOldDocuments: ---> -remove_old
AuthFile: ---> -auth_file
BaseLevel: ---> -base_level
FTPDirtyProxy: ---> -ftp_dirtyproxy
ActiveFTPData: ---> -ftp_active/-ftp_passive
ActiveFTPPortRange: ---> -active_ftp_port_range
AlwaysMDTM: ---> -always_mdtm/-noalways_mdtm
RemoveBeforeStore: ---> -(no)remove_before_store
ShowDownloadTime: ---> -stime
NLSMessageCatalogDir: ---> -msgcat
Quiet: ---> -quiet/-verbose
NewerThan: ---> -newer_than
OlderThan: ---> -older_than
Reschedule: ---> -reschedule
DontLeaveSite: ---> -dont_leave_site/-leave_site
DontLeaveDir: ---> -dont_leave_dir/-leave_dir
PreserveTime: ---> -preserve_time/-nopreserve_time
LeaveLevel: ---> -leave_level
GUIFont: ---> -gui_font
UserCondition: ---> -user_condition
CookieFile: ---> -cookie_file
CookieSend: ---> -cookie_send/-nocookie_send
CookieRecv: ---> -cookie_recv/-nocookie_recv
CookieUpdate: ---> -cookie_update/-nocookie_update
CookiesMax: ---> -cookies_max
CookieCheckDomain: ---> -cookie_check/-nocookie_check
DisabledCookieDomains: ---> -disabled_cookie_domains
DisableHTMLTag: ---> -disable_html_tag
EnableHTMLTag: ---> -enable_html_tag
TrDeleteChar: ---> -tr_del_chr
TrStrToStr: ---> -tr_str_str
TrChrToChr: ---> -tr_chr_chr
IndexName: ---> -index_name
StoreName: ---> -store_name
PreservePermisions: ---> -preserve_perm/-nopreserve_perm
PreserveAbsoluteSymlinks: ---> -preserve_slinks/-nopreserve_slinks
FTPListCMD: ---> -FTPlist/-noFTPlist
MaxRate: ---> -maxrate
MinRate: ---> -minrate
ReadBufferSize: ---> -bufsize
BgMode: ---> -bg/-nobg
CheckSize: ---> -check_size/-nocheck_size
SLogFile: ---> -slogfile
Identity: ---> -identity
SendFromHeader: ---> -send_from/-nosend_from
RunX: ---> -runX
FnameRules: ---> -fnrules
StoreDocInfoFiles: ---> -store_info/-nostore_info
AllLinksToLocal: ---> -all_to_local/-noall_to_local
AllLinksToRemote: ---> -all_to_remote/-noall_to_remote
SelectedLinksToLocal: ---> -sel_to_local/-nosel_to_local
ReminderCMD: ---> -remind_cmd
AutoReferer: ---> -auto_referer/-noauto_referer
URLsFile: ---> -urls_file
UsePreferences: ---> -prefs/-noprefs
FTPhtml: ---> -FTPhtml/-noFTPhtml
StoreDirIndexFile: ---> -store_index/-nostore_index
Language: ---> -language
FileSizeQuota: ---> -file_quota
TransferQuota: ---> -trans_quota
FSQuota: ---> -fs_quota
EnableJS: ---> -enable_js/-disable_js
UrlSchedulingStrategy: ---> -url_strategy
NetscapeCacheDir: ---> -nscache_dir
RemoveAdvertisement: ---> -remove_adv/-noremove_adv
AdvBannerRE: ---> -adv_re
CheckIfRunnigAtBackground: ---> -check_bg/-nocheck_bg
SendIfRange: ---> -send_if_range/-nosend_if_range
SchedulingCommand: ---> -sched_cmd
UniqueLogName: ---> -unique_log/-nounique_log
PostCommand: ---> -post_cmd
SSLVersion: ---> -ssl_version
UniqueSSLID: ---> -unique_sslid/-nounique_sslid
AddHTTPHeader: ---> -httpad
StatisticsFile: ---> -statfile
WaitOnExit: ---> -ewait
AllowedIPAdrressPattern: ---> -aip_pattern
DisallowedIPAdrressPattern:---> -dip_pattern
SiteLevel: ---> -site_level
UseHTTP11: ---> -use_http11
MaxRunTime: ---> -max_time
LocalIP: ---> -local_ip
RequestInfo: ---> -request
HashSize: ---> -hash_size
NumberOfThreads: ---> -nthreads
ImmediateMessages: ---> -immesg/-noimmesg
HTMLFormData: ---> -formdata
DumpFD: ---> -dumpfd
DumpUrlFD: ---> -dump_urlfd
DeleteAfterTransfer: ---> -del_after/-nodel_after
UniqueDocName: ---> -unique_name/-nounique_name
LeaveSiteEnterDirectory: ---> -leave_site_enter_dir/-dont_leave_site_enter_dir
SinglePage: ---> -singlepage/-nosinglepage
NTLMAuthorizationDomain: ---> -auth_ntlm_domain
NTLMProxyAuthorizationDomain:
---> -auth_proxy_ntlm_domain
JavascriptPattern: ---> -js_pattern
FollowCommand: ---> -follow_cmd
RetrieveSymlinks: ---> -retrieve_symlink/-noretrieve_symlink
JSTransform: ---> -js_transform
JSTransform2: ---> -js_transform2
FTPProxyUser: ---> -ftp_proxy_user
FTPProxyPassword: ---> -ftp_proxy_pass
LimitInlineObjects: ---> -limit_inlines/-dont_limit_inlines
FTPListOptions: ---> -ftp_list_options
FixWuFTPDBrokenLISTcmd: ---> -fix_wuftpd_list/-nofix_wuftpd_list
PostUpdate: ---> -post_update/-nopost_update
SeparateInfoDir: ---> -info_dir
MozillaCacheDir: ---> -mozcache_dir
AllowedPorts: ---> -aport
DisallowedPorts: ---> -dport
HackAddIndex: ---> -hack_add_index/-nohack_add_index
JavaScriptFile: ---> -js_script_file
FtpLoginHandshake: ---> -ftp_login_handshake
NSSCertDir: ---> -nss_cert_dir
NSSAcceptUnknownCert: ---> -nss_accept_unknown_cert/-nonss_accept_unknown_cert
NSSDomesticPolicy: ---> -nss_domestic_policy/-nss_export_policy
DontTouchUrlREPattern: ---> -dont_touch_url_rpattern
DontTouchUrlPattern: ---> -dont_touch_url_pattern
DontTouchTagREPattern: ---> -dont_touch_tag_rpattern
HTMLTagPattern: ---> -tag_pattern
HTMLTagREPattern: ---> -tag_rpattern
URL: ---> one URL (more lines with URL:
... means more URL's)
line which begins with '#' means comment.
TrStrToStr: and TrChrToChr: must contain two quoted strings. All parameter names
are case insensitive. If any option is missing here, try to look inside
config.c source file.
See pavukrc.sample file for example
- .pavuk_authinfo
-
File should contain as many authentication records as you
need. Records are separated by any number of empty lines. Parameter name
is case insensitive.
Structure of record:
Proto: <proto ID> ---> identification of protocol
(ftp/http/https/..)
- required field
Host: <host:[port]> ---> host name
- required field
User: <user> ---> name of user
- optional
Pass: <password> ---> password for user
- optional
Base: <path> ---> base prefix of document path
- optional
Realm: <name> ---> realm for HTTP authorization
- optional
NTLMDomain: <domain> ---> NT/LM domain for NTLM authorization
- optional
Type: <type> ---> HTTP authentication scheme
- 1/user - user auth scheme
- 2/Basic - Basic auth scheme (default)
- 3/Digest - Digest auth scheme
- 4/NTLM - NTLM auth scheme
- optional
- see pavuk_authinfo.sample file for example
- ~/.pavuk_keys
- this is the file where information about configurable menu option
shortcuts is stored. This is available only when compiled with Gtk+1.2 and
higher.
- ~/.pavuk_remind_db
- this file contains information about URLs for running in reminder
mode. Structure of this file is very easy. Each line contains information
about one URL. The first entry in a line is the last known modification time of the URL
(stored in time_t format - number of seconds from 1.1.1970 GMT). The second
entry is the URL.
pavuk -mode mirror -nobg -store_info -info_dir
/mirror/info -nthreads 1 -cdir /mirror/incoming -subdir
/mirror/incoming -preserve_time -nopreserve_perm
-nopreserve_slinks -noretrieve_symlink -force_reget
-noRobots -trans_quota 16384 -maxsize 16777216
-max_time 28 -nodel_after -remove_before_store -ftpdir
-ftplist -ftp_list_options -a -dont_leave_site
-dont_leave_dir -all_to_local -remove_old -nostore_index
-active_ftp_port_range 57344:65535 -always_mdtm
-ftp_passive -base_level 2 http://<my_host>/doc/
Look into ChangeLog file for more informations about new features in
particular versions of pavuk.
Main development Ondrejicka Stefan
Look into CREDITS file of sources for additional information.
pavuk is available from http://pavuk.sourceforge.net/
Visit the GSP FreeBSD Man Page Interface. Output converted with ManDoc. |