# Linux/Mac/WSL
# Open a terminal
ssh you25usr@ssh.bangor.ac.uk
# enter your bangor password and complete MFA
# from bangor ssh server, log into hawk
ssh b.you25usr@hawklogin.cf.ac.uk
# ! advanced option: all-in-one
ssh -J you25usr@ssh.bangor.ac.uk b.you25usr@hawklogin.cf.ac.uk
Unix-like systems (Linux and Mac) generally use a bash shell (or something like it).
“Bash is a command processor that typically runs in a text window where the user types commands that cause actions. Bash can also read and execute commands from a file, called a shell script. Like most Unix shells, it supports filename globbing (wildcard matching), piping, here documents, command substitution, variables, and control structures for condition-testing and iteration.”
See https://en.wikipedia.org/wiki/Bash_(Unix_shell)
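As a quick taste of those features, here is a minimal sketch (the file names are made up):
# a variable, plus command substitution
name=$(whoami)
echo "hello, $name"
# globbing: match every .txt file in the current directory
ls *.txt
# piping: count the matching files
ls *.txt | wc -l
# a control structure: loop over the matches
for f in *.txt; do echo "found $f"; done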
# print working directory
pwd
# make directory
mkdir my_directory
# change directory
cd my_directory
# print working directory
pwd
# print something to screen
echo "hello"
# redirect output to file
echo "hello" > hello.txt
# list contents of directory
ls
# help menu
ls --help
# view contents of a file
cat hello.txt
less hello.txt
head hello.txt
# print something else to file
echo "hello2" > hello.txt
# note: > overwrites the file, >> appends to it
echo "hello" > hello.txt
echo "hello2" >> hello.txt
# count lines and characters
wc -l hello.txt
wc -c hello.txt
# search for content in files
grep --help
grep "2" hello.txt
# pipes
grep "2" hello.txt | wc -l
# deleting (CAREFUL!!!)
rm hello.txt
# let's make some more files
yes hello | head -n 100 > my_file1.txt
yes goodbye | head -n 100 > my_file2.txt
# wildcards
cat my_file* > my_combined_file.txt
# a note on file extensions: they are just part of the name
mv my_combined_file.txt my_combined_file.mp3
mv my_combined_file.mp3 my_combined_file
# deleting with wildcards (VERY CAREFUL!!!)
rm my*
# Download file from internet
wget https://dnazoo.s3.wasabisys.com/Alligator_mississippiensis/ASM28112v4_HiC.fasta.gz
# Note file is compressed with gzip compression
# view contents without unzipping
zcat ASM28112v4_HiC.fasta.gz | less
# uncompress
gunzip ASM28112v4_HiC.fasta.gz
# note you can compress with the command gzip
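For example, to re-compress the file we just unzipped:
gzip ASM28112v4_HiC.fasta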
# log off
exit
# We assume you are in /home/hawk_username/my_directory. Check!
pwd
# let's make a file of random numbers
# we can use the bash variable $RANDOM (more on variables later)
# write a random number to a file; repeat this command 10 times to get 10 numbers
echo $RANDOM >> my_randoms.txt
# sort the numbers (sort is alphabetical by default; -n sorts numerically)
sort -n my_randoms.txt
# check for duplicates
sort my_randoms.txt | uniq -c
# stream editing: sed
# see https://www.gnu.org/software/sed/manual/sed.html
# sed 's/search_string/replace_string/g' inputfile
# s = substitute
# g = global
sed --help
sed 's/1/Axel/g' my_randoms.txt
# something more complex
# . matches any single character
# \0 = [replace with] the matched character
sed 's/./\0 /g' my_randoms.txt
# extracting the first scaffold: take the first 5576894 lines, then drop the fasta header line
head -n 5576894 ASM28112v4_HiC.fasta | tail -n 5576893 > scaffold_1.txt
# Axel's solution using sed. Slow!
# sed 's/./\0\n/g' inserts a newline after each character
sed 's/./\0\n/g' scaffold_1.txt | sort | uniq -c
# Jeronimo et al.'s method using grep, faster!
# -o print each match on its own line
# -i case insensitive
# pipe to wc -l to count the matches
grep -o -i a scaffold_1.txt | wc -l
grep -o -i t scaffold_1.txt | wc -l
grep -o -i g scaffold_1.txt | wc -l
grep -o -i c scaffold_1.txt | wc -l
# Ruby's method using tr, Fastest!
# -c complement, find opposite of match
# -d delete
tr -cd 'Aa' < scaffold_1.txt | wc -c
tr -cd 'Tt' < scaffold_1.txt | wc -c
tr -cd 'Gg' < scaffold_1.txt | wc -c
tr -cd 'Cc' < scaffold_1.txt | wc -c
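If you want to check the speed claims yourself, you can prefix each command with time (a sketch; run times will vary):
time sed 's/./\0\n/g' scaffold_1.txt | sort | uniq -c
time grep -o -i a scaffold_1.txt | wc -l
time tr -cd 'Aa' < scaffold_1.txt | wc -c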
Shell scripts are given the extension .sh by convention. The first line, #!/bin/bash, tells the system to run the script with bash (as opposed to python, perl, R, etc.). Lines beginning with # are comments.
#!/bin/bash
# let's make some files
yes hello | head -n 100 > my_file1.txt
yes goodbye | head -n 100 > my_file2.txt
# combine files
cat my_file* > my_combined_file.txt
echo "job finished"
We can write and edit scripts on the server with the text editor vim.
# open file
vim my_script.sh
# enter insert mode
i
# do stuff ######################################
# exit insert mode
[ESC]
# write changes to file
:w
# quit
:q
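You can also combine the last two steps:
# write changes and quit in one command
:wq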
# run the script
sh my_script.sh
Create the bash script described above and execute it. In bash, variables can be used to store data in memory, much like objects in R. Variables are declared as variable=value and referenced with $variable. Arguments passed on the command line are available inside the script as $1, $2, $3, etc.
#!/bin/bash
# script makes lists of 2 words, which can be entered when the script is called
# $count controls the length of the list
# call the script like:
# sh my_script.sh hello goodbye
# declare variables
count=100
# main script
# let's make some files
yes $1 | head -n $count > my_file1.txt
yes $2 | head -n $count > my_file2.txt
# combine files
cat my_file* > my_combined_file.txt
echo "job finished"
#!/bin/bash
# a while loop: repeats while the condition is true
# -lt = less than
x=0
while [ $x -lt 10 ]; do
echo The counter is $x
let x=x+1
done
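The same loop can also be written with bash's C-style for syntax:
#!/bin/bash
for (( x=0; x<10; x++ )); do
echo The counter is $x
done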
# print working directory
pwd
# list contents of working directory
ls
# move up one level in hierarchy
cd ..
# print working directory
pwd
# list contents of working directory
ls
# move to root directory
cd /
# list contents of working directory
ls
# Go home
cd ~
# copying files
cp <source> <destination_file>
cp sheep.txt bla/sheep_in_bla.txt
# on hawk: make a small file
echo "scp'ing is awesome!" > copy.txt
# check the small file
cat copy.txt
# open a new terminal from your local PC & log into the ssh jump host
ssh you25usr@ssh.bangor.ac.uk
# copy the file from hawk to bangor ssh server
scp b.you25usr@hawklogin.cf.ac.uk:/home/b.you25usr/copy.txt .
# log out of bangor ssh server
exit
# copy the file from bangor ssh server to your local PC
scp you25usr@ssh.bangor.ac.uk:/home/people/you25usr/copy.txt .
# for WSL users: if you move the file to your Windows filesystem, you can edit in Notepad etc
# your windows filesystem can be found in the WSL in the following path:
/mnt/c/Users/your_windows_username/
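For example, to copy the file from WSL onto your Windows desktop (replace your_windows_username with your actual username; assumes the standard Desktop folder):
cp copy.txt /mnt/c/Users/your_windows_username/Desktop/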
# !Advanced! all-in-one
scp -o ProxyJump=you25usr@ssh.bangor.ac.uk b.you25usr@hawklogin.cf.ac.uk:/home/b.you25usr/copy.txt .
# start in your home
cd
# Copy the contents of this file:
/scratch/scw2141/oooooo/its/such/a/long/filepatch/paths_of_glory.txt
# To a file named with your name, here:
/scratch/scw2141/and/did/he/use/caPITAls/what/a/masochist/
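One possible answer, sketched with cp (substitute your own name for your_name):
cp /scratch/scw2141/oooooo/its/such/a/long/filepatch/paths_of_glory.txt /scratch/scw2141/and/did/he/use/caPITAls/what/a/masochist/your_name.txt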
# module searching
module avail
module avail R
# module load
module load R/4.2.0
# unload all modules
module purge
# You are currently using the login node.
# Do not run resource-intensive jobs on the login node!
# use slurm to open an active bash shell on a compute node
srun --nodes=1 --ntasks=4 --mem=4G --time=0-00:20 --pty /bin/bash
# the queue
squeue
squeue -u username
# do something resource intensive
module load pigz
pigz -p 4 ASM28112v4_HiC.fasta
# exit
exit
# check efficiency
seff jobid
Rather than working interactively, we normally submit a job script to slurm. Options are passed to slurm using #SBATCH directives at the top of the script.
#!/bin/bash --login
###
#job name
#SBATCH --job-name=example
#job stdout file
#SBATCH --output=/path/to/dir/example%J
#job stderr file
#SBATCH --error=/path/to/dir/example_err%J
#maximum job time in D-HH:MM
#SBATCH --time=0-00:01
#number of nodes
#SBATCH --nodes=1
#number of parallel processes (tasks)
#SBATCH --ntasks=1
#memory in Gb
#SBATCH --mem=1G
echo hello > hello.txt
# submit to queue
sbatch job_script.sh
# on the command line
sbatch --account=scw1234 job_script.sh
# or include the following SBATCH directive in the script
#assign account
#SBATCH --account=scw1234
Exercise: in your home directory, create two directories, my_slurm_scripts and output_logs. Save your slurm script in ./my_slurm_scripts and direct its stdout and stderr log files to ./output_logs. Next, submit the slurm script. A minimal sketch of the setup follows.
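(Sketch only; the paths assume the b.you25usr example username from earlier.)
# make the two directories in your home directory
mkdir ~/my_slurm_scripts ~/output_logs
# in your slurm script, point the log files there, e.g.:
#SBATCH --output=/home/b.you25usr/output_logs/example%J
#SBATCH --error=/home/b.you25usr/output_logs/example_err%J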
# enter the BEARCAVE
cd /scratch/scw2141/BEARCAVE2/
# view directories (ll is a common alias for ls -l)
ll
# raw data
cd rawdata
# look at a fastq file
zcat ./adder01/a01+adder01_R1.fastq.gz | less
# look at metadata
cat ./metadata.txt
# look at reference genome
less /scratch/scw2141/BEARCAVE2/refgenomes/berus_chr7/berus_chr7.fa
NOTE: if you haven’t been able to finish this section when we move on, just continue with the next section and we will make sure there’s time at the end of the day to get you caught up.
less BEARCAVE2/scripts/trim_merge_DS_PE_standard2_JP.sh
#!/bin/bash
# Hacked by Axel, April 2023
# For use on Hawk which has shiny modules, using new cutadapt and pigz to allow multithreading
# you will need to load the following modules in your slurm script:
# module load cutadapt/4.3 FLASH/1.2.11 pigz/2.4
# November 2018
# script for trimming adapter sequences from DS library PE data, removing short reads, and merging overlapping PE reads.
# this script is for data sequenced using the standard Illumina R1 sequencing primer
# this script should be run from /BEARCAVE/scripts
# example command line: sh ./trim_merge_DS_PE_standard2.sh $1 $2 $3
# $1 = SAMPLE*
# $2 = PREFIX*
# $3 = SEQ_RUN*
# * these can be found in /BEARCAVE/rawdata/metadata.txt
Write a slurm job script in your chosen location within your home directory. It should do the following steps (a sketch follows below).
#set working directory
#SBATCH --chdir=/scratch/scw2141/BEARCAVE2/scripts/
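A minimal sketch of such a job script, combining the #SBATCH directives from above with the modules the trimming script needs. SAMPLE, PREFIX, and SEQ_RUN are placeholders to be taken from metadata.txt, and the resource requests are guesses, not tested values:
#!/bin/bash --login
#SBATCH --job-name=trim
#SBATCH --output=/home/b.you25usr/output_logs/trim%J
#SBATCH --error=/home/b.you25usr/output_logs/trim_err%J
#SBATCH --time=0-02:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=8G
#SBATCH --chdir=/scratch/scw2141/BEARCAVE2/scripts/
# load the modules the trimming script needs
module load cutadapt/4.3 FLASH/1.2.11 pigz/2.4
# run the trimming script with sample, prefix, and sequencing run from metadata.txt
sh ./trim_merge_DS_PE_standard2_JP.sh SAMPLE PREFIX SEQ_RUN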
less BEARCAVE2/scripts/map_modern_PE_mem2_tmp_JP.sh
#!/bin/bash
# Hacked by Axel, April 2023
# For use on Hawk which has shiny modules, using new bwa and samtools
# this uses the improved duplicate removal method (not rmdup)
# you will need to load the following modules in your slurm script:
# module load bwa/0.7.17 samtools/1.17 pigz/2.4
# November 2018
# script for mapping modern DNA reads to a reference genome
# note this includes mapping of both merged and unmerged paired-end reads
# this script should be run from /BEARCAVE/scripts
# example command line: sh ./map_modern_PE_mem2.sh $1 $2 $3 $4
# $1 = prefix or prefixes (if you are mapping a combined data file, you should enter its whole set of prefixes as one argument, like this: pr1_pr2_pr3)
# $2 = reference - folder name in /refgenomes/
# $3 = 3 character taxon identifier, used for filenaming - e.g. arc, spe, kud, mar.
# $4 = sample name
Trimmed data will be written to /scratch/scw2141/BEARCAVE2/trimdata/ and the trimming logs to /scratch/scw2141/BEARCAVE2/trimdata/trimlogs. We will carry out population genomics analyses using angsd. This requires a list of bamfiles (with full paths) to analyse. For example:
# angsd requires a list of bam files in a text file
ls -1 /scratch/scw2141/BEARCAVE2/mappedberus_chr7/*bam > bamlist_outgroup
# load angsd module
module purge
module load angsd/0.935
# set ref genome as a variable
ref='/scratch/scw2141/BEARCAVE2/refgenomes/berus_chr7/berus_chr7.fa'
# angsd command
#-doIBS,-doCounts = Identity by state, currently set as single read sampling
#-doCov = calculate covariance matrix
#-makeMatrix = calculate distance matrix
#-doMajorMinor, -GL = identify major/minor allele using genotype likelihoods
#-minFreq = min allele frequency for site
#-minInd = min individuals with data
#-minQ,-minMapQ = min base and map quality
#-b = specify bamlist
#-ref = reference genome
#-out = name outputs
#-nThreads = number threads
# this calculates the distance matrix, include the outgroup
angsd -doIBS 1 -makeMatrix 1 -doCounts 1 -doMajorMinor 1 -GL 1 -minFreq 0.1 -minInd 48 -minQ 30 -minMapQ 30 -b bamlist_outgroup -ref $ref -out bamlist_outgroup -nThreads 10
# load modules and declare variables as above
# this calculates the covariance matrix, exclude the outgroup
angsd -doIBS 1 -doCov 1 -doCounts 1 -doMajorMinor 1 -GL 1 -minFreq 0.1 -minInd 47 -minQ 30 -minMapQ 30 -b bamlist_ingroup -ref $ref -out bamlist_ingroup -nThreads 10
# set files as variables
input='/scratch/scw2141/BEARCAVE2/mappedberus_chr7/xxx+adderxx_ber_berus_chr7_.xxxxx.bam'
output='adderxx_het'
# We need to specify the ancestral allele, use fasta generated from bosniensis outgroup
bos='/scratch/scw2141/bosniensis_chr7/bosniensis_chr7.fa'
# depth variables
max=20
min=3
# estimate genotype likelihoods
# -i = input file
# -doSaf = per-site allele frequencies
# -GL = genotype likelihood model
# -P = number threads
# -anc = outgroup, to identify ancestral allele
#-minQ,-minMapQ = min base and map quality
# -setMinDepthInd, -setMaxDepthInd = min and max depth filters per individual
angsd -i $input -doSaf 1 -GL 1 -P 10 -anc $bos -out $output -minMapQ 30 -minQ 30 -setMinDepthInd $min -setMaxDepthInd $max -doCounts 1
# calculate sfs
# -nSites = sliding window size
# -P = number of threads
realSFS $output.saf.idx -nSites 100000 -P 10 > $output.window.sfs
R commands
Download the lizard_data.txt file to your working directory (use right-click “Save link as…”):
# R is just a big calculator
1+1
# make an object
my_sum <- 1+1
# vectors
my_vec <- c(1,2,3,4)
# functions
length(my_vec)
# plotting
my_vec2 <- c(1,2,3,4) # generate data
plot(my_vec, my_vec2)
plot(c(1,2,3,4), c(4,3,2,1))
# sample from a normal distribution
rnorm(500, 10, 1)
# histogram
hist(rnorm(500, 10, 1))
# read in data as a dataframe
my_liz <- read.table("lizard_data.txt", header=TRUE)
# isolate svl data
my_liz$svl
# and we can make this into a self-contained script too...
# ...and also run this on the SCW
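For example, a sketch of how you might run such a script from the shell on hawk (assumes the R/4.2.0 module used earlier and a script named lizards.R):
module load R/4.2.0
Rscript lizards.R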
Copy /scratch/scw2141/addeRs/ to your home directory and execute the lizards.R script on hawk.
Download the following 3 files to your working directory (use right-click “Save link as…”):
Download adder_distribution.shx
Download adder_distribution.shp
# read in the data
adders <- read.table("adder_info.txt", header=T, sep='\t')
# load libraries
library(rworldmap)
# retrieve map
map <- getMap(resolution = "low")
# plot simple map
plot(map, col="grey", border=NA)
# load libraries
library(sf)
# add IUCN distribution as SHP file on the map
iucn <- st_read("adder_distribution.shx")
plot(st_geometry(iucn), add=TRUE, border=NA, col="slategrey")
# add samples
points(adders$lon,adders$lat)
Improve the plots using arguments to the plot() function (example scripts are in /scratch/scw2141/addeRs/). You can use your own angsd output files or the ones here:
Download bamlist_ingroup.covMat
Download bamlist_outgroup.ibsMat
Download adder01_het.window.sfs
# read in covariance matrix
my_cov <- as.matrix(read.table("bamlist_ingroup.covMat"))
# do PCA
my_pca <- eigen(my_cov)
# plot
plot(my_pca$vectors[,1], my_pca$vectors[,2])
# we use the ape library
library(ape)
# read in matrix
my_mat <- as.matrix(read.table("bamlist_outgroup.ibsMat"))
# compute tree
my_tree <- nj(my_mat)
# plot
my_root <- root(my_tree, outgroup=48)
plot(my_root)
my_sfs <- read.table("adder01_het.window.sfs")
hist(my_sfs$V2/100)
plot(1:length(my_sfs$V2), my_sfs$V2/100)
Each plot is in its most basic form. Spend some time trying to improve each one to your liking.