PullOSTree.sh

From WBITT's Cooker!

Revision as of 10:50, 30 May 2011 by Kamran (Talk | contribs)
Jump to: navigation, search
#!/bin/bash
# Author: Muhammad Kamran Azeem (kamran@wbitt.com)
# Created: 30 May 2011
# Revised: 30 May 2011
# Summary: Pulls the structure (and files) of the CENTOS OS repository, for which the URL is provided.
#          With or without RPMs. 
######################################################################################################

######## User configuration - Start #######

# ARCH selects which architecture's tree is mirrored. It can be i386 or x86_64.
ARCH="i386"

# GETRPMS can be set to "Y" or "N".
# Setting GETRPMS to "Y" will pull all the (4+ GB) RPMs inside CentOS directory of the distribution.
# Setting GETRPMS to "N" will, of course, just pull the OS tree without the RPMs. About 230 MB in total.
GETRPMS="N"

# Check list of available URLs: http://www.centos.org/modules/tinycontent/index.php?id=30
# Keep the trailing slash, so that every path component of the URL is a directory.
URL="http://centos.mirror.nexicom.net/5.6/os/${ARCH}/"

# CUTDIRS is the value handed to wget's --cut-dirs option.
# Leave it empty to have it derived from the number of directories that follow the FQDN in URL
# (3 for the URL above). Set it to a number only if you want a different local layout.
CUTDIRS=""

######## User configuration - End #######

# count_url_dirs URL
# Prints the number of non-empty path components that follow the host name in URL.
# An optional "scheme://" prefix is ignored; a URL without any path yields 0.
count_url_dirs() {
  local remainder="${1#*://}"
  local -a components
  local component
  local total=0

  # Without a slash after the host name there is no path to count.
  if [[ "${remainder}" != */* ]]; then
    echo 0
    return 0
  fi

  # Drop the host name, then split what is left on "/".
  remainder="${remainder#*/}"
  IFS='/' read -r -a components <<< "${remainder}"

  # Empty components (produced by "//") are not directories, so skip them.
  for component in "${components[@]}"; do
    if [[ -n "${component}" ]]; then
      total=$(( total + 1 ))
    fi
  done

  echo "${total}"
}

# Derive CUTDIRS from URL unless it was set explicitly above, so the two cannot drift apart.
if [[ -z "${CUTDIRS}" ]]; then
  CUTDIRS="$(count_url_dirs "${URL}")"
fi


# The first positional argument is the local directory under which the OS tree is stored.
# Without it there is nowhere to download to, so explain the usage and stop.
if [[ -z "${1:-}" ]]; then
  # Usage errors go to stderr so that they never mix with regular output.
  echo "Must provide target top level directory for the OS to be stored. e.g. /data/cdimages/CentOS-5.6-i386/" >&2
  echo "You can set ARCH and source URL in the script itself." >&2
  exit 1
fi

TARGETDIRECTORY="$1"

# build_exclude_options GETRPMS_VALUE
# Prints the extra wget option that keeps the RPMs out of the download when
# GETRPMS_VALUE is "N" (or "n"); prints nothing for any other value.
build_exclude_options() {
  # Note 1:
  # --exclude-directories option in wget wants a full path, not one relative to the download URL.
  # So if you are downloading from http://ayo.freshrpms.net/fedora/linux/3/i386/RPMS.updates/
  # Use e.g: --exclude-directories=fedora/linux/3/i386/RPMS.updates/debug,fedora/linux/3/i386/RPMS.updates/headers

  # Note 2:
  # --exclude-directories will not create the directory matching the exclude filter.
  # Thus, if you use this method, you will need to create a CentOS directory inside the ${TARGETDIRECTORY}
  # So use either: --exclude-directories=5.6/os/${ARCH}/CentOS
  # Or this: --reject=*.rpm
  # Do not embed any quote characters around *.rpm in the value: they would be handed
  # to wget literally as part of the pattern, and it would not work.

  # The value is quoted so that an empty or multi-word setting cannot break the test.
  # A lowercase "n" is accepted too, as it clearly means "no RPMs".
  if [[ "${1:-}" == [Nn] ]]; then
    printf '%s\n' "--reject=*.rpm"
  fi
}

EXCLUDEOPTIONS="$(build_exclude_options "${GETRPMS}")"

# Split EXCLUDEOPTIONS into separate words on whitespace only. Unlike an unquoted
# expansion, this never performs pathname expansion, so a pattern such as *.rpm
# reaches wget untouched even if matching files exist in the current directory.
read -r -a EXCLUDE_ARGS <<< "${EXCLUDEOPTIONS}"

# Build the command as an argument vector, so that a URL or target directory
# containing spaces or glob characters is passed to wget as a single argument.
WGET_ARGV=(
  wget --mirror "${URL}"
  --no-parent --no-host-directories
  "${EXCLUDE_ARGS[@]}"
  "--reject=index.html*"
  "--cut-dirs=${CUTDIRS}"
  -P "${TARGETDIRECTORY}"
)

# Human-readable form of the command, used only for the progress messages.
COMMAND="${WGET_ARGV[*]}"

echo "Now executing: ${COMMAND}"

"${WGET_ARGV[@]}"
# Capture wget's status right away; any later command would overwrite $?.
WGET_STATUS=$?

echo "Finished executing: ${COMMAND}"

if [[ "${WGET_STATUS}" -ne 0 ]]; then
  echo "wget exited with status ${WGET_STATUS}" >&2
fi

# Clean up. Delete the index.html garbage from the Target Directory:
# No need of this clean up, as  --reject=index.html*  will take care of it automatically.
# find "${TARGETDIRECTORY}" -name "index.html*" -exec rm '{}' ';'

# Report wget's result to the caller, not that of the final echo.
exit "${WGET_STATUS}"

#########################################################################################################################################################
Explanation of important switches from wget man page:
-----------------------------------------------------
       -m
       --mirror
           Turn on options suitable for mirroring.  This option turns on recursion and time-stamping, sets infinite recursion depth and keeps FTP
           directory listings.  It is currently equivalent to -r -N -l inf --no-remove-listing.

       -X list
       --exclude-directories=list
           Specify a comma-separated list of directories you wish to exclude from download.  Elements of list may contain wildcards.

       -A acclist --accept acclist
       -R rejlist --reject rejlist
           Specify comma-separated lists of file name suffixes or patterns to accept or reject. Note that if any of the wildcard characters, *, ?, [ or
           ], appear in an element of acclist or rejlist, it will be treated as a pattern, rather than a suffix.

       -np
       --no-parent
           Do not ever ascend to the parent directory when retrieving recursively.  This is a useful option, since it guarantees that only the files
           below a certain hierarchy will be downloaded.

       -nH
       --no-host-directories
           Disable generation of host-prefixed directories.  By default, invoking Wget with -r http://fly.srk.fer.hr/ will create a structure of
           directories beginning with fly.srk.fer.hr/.  This option disables such behavior.

       -P prefix
       --directory-prefix=prefix
           Set directory prefix to prefix.  The directory prefix is the directory where all other files and subdirectories will be saved to, i.e. the top
           of the retrieval tree.  The default is . (the current directory).

       --cut-dirs=number
           Ignore number directory components.  This is useful for getting a fine-grained control over the directory where recursive retrieval will be
           saved.

           Take, for example, the directory at ftp://ftp.xemacs.org/pub/xemacs/.  If you retrieve it with -r, it will be saved locally under
           ftp.xemacs.org/pub/xemacs/.  While the -nH option can remove the ftp.xemacs.org/ part, you are still stuck with pub/xemacs.  This is where
           --cut-dirs comes in handy; it makes Wget not "see" number remote directory components.  Here are several examples of how --cut-dirs option
           works.

                   No options        -> ftp.xemacs.org/pub/xemacs/
                   -nH               -> pub/xemacs/
                   -nH --cut-dirs=1  -> xemacs/
                   -nH --cut-dirs=2  -> .

                   --cut-dirs=1      -> ftp.xemacs.org/xemacs/
                   ...

           If you just want to get rid of the directory structure, this option is similar to a combination of -nd and -P.  However, unlike -nd,
           --cut-dirs does not lose with subdirectories---for instance, with -nH --cut-dirs=1, a beta/ subdirectory will be placed to xemacs/beta, as one
           would expect.

Personal tools