#!/usr/bin/ksh -p
#
# Download a run of consecutively numbered chapter pages of one online book
# and save the text of each page as a numbered .txt file.
#
# Usage: <script> <-f first_url> [-l last_url] [-d output_dir]
#   -f first_url   URL of the first chapter page to fetch (required)
#   -l last_url    URL of the last chapter page (default: first_url)
#   -d output_dir  directory for the .txt files (default: current directory)
#
# Requires: wget, grep, sed, tr, mktemp.
#
# Exit status: 1 on a usage error, mismatched URLs, a non-numeric page
# number, or when the output/work directory cannot be created.

# Print the usage line on stderr and exit.
usage() {
  printf '\nUsage: %s <-f first_url> [-l last_url] [-d output_dir]\n' \
    "${0##*/}" >&2
  exit 1
}

# Print a message on stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

while getopts f:l:d: opt; do
  case $opt in
    f) first_url=$OPTARG ;;
    l) last_url=$OPTARG ;;
    d) output_dir=$OPTARG ;;
    *) usage ;;
  esac
done

# -f is mandatory; without it there is nothing to derive the page range from.
[[ -n $first_url ]] || usage
[[ -z $last_url ]] && last_url=$first_url
[[ -z $output_dir ]] && output_dir=$PWD

#
# Sina book example:
#
# http://vip.book.sina.com.cn/book/chapter_79460_47740.html
# |< ---------------- prefix ---------------- >|page |suffix|
#
# Both sides are quoted so the right-hand side is compared literally rather
# than being interpreted as a glob pattern.
if [[ "${first_url%_*}" != "${last_url%_*}" ]]; then
  die "Check the URL to make sure it is the same book!"
fi

prefix=${first_url%_*}
suffix=${first_url##*.}

# The quoted pattern parts are stripped literally, even if the URL contains
# glob characters such as '?' or '*'.
first_chapter_num=${first_url#"${prefix}_"}
first_chapter_num=${first_chapter_num%".$suffix"}
last_chapter_num=${last_url#"${prefix}_"}
last_chapter_num=${last_chapter_num%".$suffix"}

# The page numbers drive an arithmetic loop, so reject anything non-numeric
# (this also catches URLs that contain no '_' at all).
for page_num in "$first_chapter_num" "$last_chapter_num"; do
  case $page_num in
    '' | *[!0-9]*)
      die "Cannot find a numeric page number in: $first_url $last_url"
      ;;
  esac
done

mkdir -p -- "$output_dir" || die "Cannot create output directory: $output_dir"

# Private scratch directory, removed on every exit path.
work_dir=$(mktemp -d) || die "Cannot create a temporary directory"
trap 'rm -rf -- "$work_dir"' EXIT
page_html=$work_dir/page.html

# Markers used to locate the chapter title and the chapter text in the page.
# title_open/title_close are used as shell patterns; the others are used as
# grep/sed basic regular expressions ('/' is escaped for the sed s command).
#
# NOTE(review): several literals below contain only blank lines where markup
# would be expected -- presumably the HTML tags they held were lost from this
# copy of the script. They are kept exactly as found; confirm and restore the
# real tags before relying on the output.
title_line_re="

"
title_open="

"
title_close="

"
body_head="

"
body_middle="<\/p>

"
body_tail="<\/p><\/div>"

typeset -i n=1
while (( first_chapter_num <= last_chapter_num )); do
  echo "============== Starting Get Page No.$n ============"

  # wget the source html file into the scratch directory; a fixed output
  # name avoids wget's ".1" renaming when a file of that name already exists
  page_url=${prefix}_${first_chapter_num}.${suffix}
  if wget --output-document="$page_html" "$page_url"; then
    # figure out chapter name: keep what lies between the title markers
    chapter_name=$(grep -e "$title_line_re" "$page_html")
    chapter_name=${chapter_name##*$title_open}
    chapter_name=${chapter_name%%$title_close*}
    # the title comes from the downloaded page: keep it from introducing
    # extra path components
    chapter_name=$(printf '%s' "$chapter_name" | tr '/' '_')

    # add serial number for each page to avoid confusion; the title is passed
    # as an argument, never as part of the printf format string
    chapter_file=$(printf '%s/%03d%s.txt' "$output_dir" "$n" "$chapter_name")

    # filter out content head and tail tag; no eval, so the page-supplied
    # title in the output path is never re-parsed as shell code
    grep -e "$body_head" "$page_html" |
      sed -e "s/$body_head//" -e "s/$body_middle//g" -e "s/$body_tail//" \
        > "$chapter_file"
  else
    printf 'Failed to download %s; skipping.\n' "$page_url" >&2
  fi

  (( first_chapter_num = first_chapter_num + 1 ))
  (( n = n + 1 ))
done

echo "==================== DONE ==================="