html

HTML Scraping, Screen Scraping

Written on October 31, 2019
#!/bin/bash
#
# Scrape one heading and one table out of a remote HTML page and publish them
# as a minimal, ASCII-only HTML document.
#
# Steps:
#   1. Download URL with wget.
#   2. Join the page into a single line so that elements spanning several
#      lines can be cut out with plain field splitting.
#   3. Write a small HTML skeleton containing the extracted heading text and
#      the extracted table markup.
#   4. Transliterate the result from UTF-8 to ASCII and store it in TARGET.
#
# The extraction is purely positional (see HEADING_FIELD / TABLE_FIELD); it is
# not an HTML parser, so the indices have to be re-tuned whenever the layout
# of the source page changes.
#
# Exit status: 0 on success; non-zero if the download, the extraction or the
# conversion fails. TARGET is only rewritten after every earlier step worked.

set -euo pipefail

readonly URL="https://example.com/bla.html"
readonly TARGET="/path/to/file/output.html"

# Field kept after splitting the one-line page on the literal string "h1".
# NOTE(review): on a page whose only "h1" occurrences are heading tags this
# would be the third <h1> element - confirm against the live markup.
readonly HEADING_FIELD=6

# Field kept after splitting the one-line page on the literal string "table",
# i.e. the text between its 4th and 5th occurrence.
readonly TABLE_FIELD=5

# Print a message to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Use a private scratch directory rather than fixed names in /tmp, so that
# parallel runs cannot clobber each other and nobody can pre-create the paths.
WORKDIR=$(mktemp -d) || die "cannot create temporary directory"
readonly WORKDIR
cleanup() { rm -rf -- "${WORKDIR:?}"; }
trap cleanup EXIT

FILE="$WORKDIR/in"     # raw download
OUT="$WORKDIR/out"     # rebuilt page, still UTF-8
ASCII="$WORKDIR/ascii" # rebuilt page after transliteration

wget -O "$FILE" "$URL" || die "download of $URL failed"
[[ -s "$FILE" ]] || die "download of $URL returned an empty page"

# Flatten the page once; both extractions work on the same single line.
page=$(tr -d '\n' < "$FILE")

{
  printf '%s\n' '<html><head></head><body>'
  printf '%s\n' '<h1>'
  # Take the chosen "h1"-delimited field, then trim it to the text that
  # follows its first '>' and precedes the next '<' (the text content of a
  # simple tag).
  printf '%s\n' "$page" \
    | awk -F'h1' -v n="$HEADING_FIELD" '{ print $n }' \
    | cut -d'>' -f2 \
    | cut -d'<' -f1
  printf '%s\n' '</h1>'
  # Splitting on "table" eats the tag names themselves, so they are put back
  # around the chosen field.
  printf '%s\n' "$page" \
    | awk -F'table' -v n="$TABLE_FIELD" '{ print "<table " $n "table>" }'
  printf '%s\n' '</body></html>'
} > "$OUT"

# Convert into a scratch file first: a failing iconv must not leave a
# truncated document at TARGET.
iconv -f utf-8 -t ascii//TRANSLIT < "$OUT" > "$ASCII" \
  || die "character conversion failed; $TARGET left unchanged"

# Copy the content (rather than mv) so an existing TARGET keeps its
# permissions and ownership.
cat -- "$ASCII" > "$TARGET" || die "cannot write $TARGET"