Tile generation of big data
AWS_ES_DOMAIN=52.23.165.123:9872
# Create tiles from coolers
workon py3
assembly=hg19
FILENAME=/Dixon2015-H1_hESC-HindIII-allreps-filtered.5kb.cool
#for FILENAME in Dixon2015-H1_NP-HindIII-allreps-filtered.50kb.cool Dixon2015-H1_NP-HindIII-allreps-filtered.5kb.cool;
for FILENAME in Rao2014-NHEK-MboI-allreps-filtered.50kb.cool Rao2014-K562-MboI-allreps-filtered.50kb.cool Rao2014-IMR90-MboI-allreps-filtered.50kb.cool Rao2014-HUVEC-MboI-allreps-filtered.50kb.cool Rao2014-HMEC-MboI-allreps-filtered.50kb.cool Rao2014-GM12878-MboI-allreps-filtered.50kb.cool;
do
DATASET_NAME=${assembly}/${FILENAME}
FILEPATH=/data/coolers/${DATASET_NAME}
python ~/projects/cooler/scripts/dump_matrix_txt.py ${FILEPATH} --balanced --join --out - | awk '{ if (NF == 6) print $1 "\t" $2 "\t" $4 "\t" $5 "\t" 0; else print $1 "\t" $2 "\t" $4 "\t" $5 "\t" $7; }' | grep -v start1 | chr_pos_to_genome_pos.py -c 1,2:3,4 -a $assembly | make_triangular.py | sort -k1,1n -k2,2n - | gzip > ${FILEPATH}.genome.sorted.gz
done;
AWS_ES_DOMAIN=52.45.229.11:9872
ASSEMBLY=hg19
RESOLUTION=50000
for DATASET_NAME in Rao2014-NHEK-MboI-allreps-filtered.50kb.cool.genome.sorted.gz Rao2014-K562-MboI-allreps-filtered.50kb.cool.genome.sorted.gz Rao2014-IMR90-MboI-allreps-filtered.50kb.cool.genome.sorted.gz Rao2014-HUVEC-MboI-allreps-filtered.50kb.cool.genome.sorted.gz Rao2014-HMEC-MboI-allreps-filtered.50kb.cool.genome.sorted.gz Rao2014-GM12878-MboI-allreps-filtered.50kb.cool.genome.sorted.gz;
do
INDEX_NAME=${ASSEMBLY}/${DATASET_NAME}
#INDEX_NAME=${DATASET_NAME,,}/tiles
echo $INDEX_NAME
FILENAME=coolers/${ASSEMBLY}/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
#curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
zcat ${FILEPATH} | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME} --num-threads 4 --triangular --log-file clodius.log # 16:48:26
done;
###################################################################################################################################
# Creating autocomplete tiles
#AWS_ES_DOMAIN=52.23.165.123:9872
# wget https://raw.githubusercontent.com/pkerpedjiev/gene-citation-counts/master/all_gene_counts.tsv
# mv all_gene_counts.tsv ~/data/genbank-data/human/
AWS_ES_DOMAIN=52.45.229.11:9872
workon py3
ASSEMBLY=hg19
python scripts/make_autocomplete_list.py -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count ~/data/hg19/genbank-output/refgene-count/ -n geneName -i count --elasticsearch-url ${AWS_ES_DOMAIN}/${ASSEMBLY}.autocomplete --reverse-importance
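# Quick sanity check of the autocomplete index (a hypothetical query sketch, not from the original notes;
# the geneName field comes from the column list above, the gene symbol is arbitrary):
#curl -XGET "http://${AWS_ES_DOMAIN}/${ASSEMBLY}.autocomplete/_search" -d '{ "query": { "match": { "geneName": "TP53" } } }'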
################################################################################################################################
# Gene Density Data
FILEPATH=~/data/encode/hg19/gencodeDensity
awk '{ print $1, $2, $1, $3, $4}' ${FILEPATH}.bedGraph | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 | /usr/bin/time sort -k1,1n -k2,2n - | gzip > ${FILEPATH}.genome.sorted.gz
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
RESOLUTION=1
DATASET_NAME=gencodeDensity.genome.sorted.gz
INDEX_NAME=${ASSEMBLY}/${DATASET_NAME}
#INDEX_NAME=${DATASET_NAME,,}/tiles
#echo $INDEX_NAME
FILENAME=encode/${ASSEMBLY}/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
##curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
zcat ${FILEPATH} | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --expand-range 1,2 --ignore-0 -k 1 -v 3 --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME} --log-file clodius.log --max-queue-size 2000 --print-status # 16:48:26
################################################################################################################################
### Max's Data
IDENTIFIER=UMB5144
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
BASE_PATH=~/data/clodius-input/hg19
FILEPATH=${BASE_PATH}/${IDENTIFIER}.all_bins.tsv
INDEX_NAME=${ASSEMBLY}/${ASSEMBLY}.${IDENTIFIER}.all_bins.sorted.genome.gz
RESOLUTION=1000
################### Prepare
rm ${FILEPATH}
for i in $(seq -f "%02g" 21) X Y;
do
pv -cN chr${i} ${BASE_PATH}/${IDENTIFIER}/b1000/UMB5144-final-${i}.b1000.bin | grep -v start | awk -v chrom=${i} '{ gsub("^0*", "", chrom); print "chr" chrom "\t" $1 "\t" "chr" chrom "\t" $2 "\t" $3 "\t" $4 "\t" $5 }' | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 >> ${FILEPATH}
done;
pv -cN sorting... ${FILEPATH} | sort -k1,1n -k2,2n - | gzip > ${FILEPATH}.genome.sorted.gz
################### Tile Ratios
AWS_ES_DOMAIN=52.23.165.123:9872
zcat ${FILEPATH}.genome.sorted.gz | head -n 1000 | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --expand-range 1,2 -k 1 -v 3 --columnfile-path /tmp/maxs-tiles --log-file clodius.log --max-queue-size 2000 --print-status # 16:48:26
pv -cN tiling ${FILEPATH}.genome.sorted.gz | zcat | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --expand-range 1,2 -k 1 -v 3,4 --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME} --log-file clodius.log --max-queue-size 2000
############# Tile CNV calls
DATASET_NAME=UMB5144__B1000_l03.BICseq.out
CNV_PATH=${BASE_PATH}/${IDENTIFIER}/b1000/lambda_03/${DATASET_NAME}
pv -cN chr_pos_to_genome_pos $CNV_PATH | grep -v 'start' | awk '{print $1, $2, $1, $3, $4, $5, $6, $7, $8}' | chr_pos_to_genome_pos.py -c 1,2:3,4 -a hg19 > ${CNV_PATH}.genome
INPUT_FILE=${CNV_PATH}.genome
AWS_ES_DOMAIN=52.45.229.11:9872
ASSEMBLY=hg19
INDEX_NAME=${ASSEMBLY}/${DATASET_NAME}
/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position start --end-position end -c chrom,start,end,binNum,observed,expected,log2_copyRatio,pValue --max-zoom 18 -i pValue --importance --max-entries-per-tile 16 --assembly ${ASSEMBLY} ${INPUT_FILE}
################################################################################################################################
### Cooler to tiles
python scripts/cooler_to_tiles.py /data/tmp/UNTR.1kb.multires.cool --assembly mm9 --max-zoom 3 --elasticsearch-url 52.23.165.123:9872/hg19.1/mm9.UNTR.1kb.multires.cool
################################################################################################################################
### Tile BigWig
workon py2
python scripts/tile_bigWig.py ~/data/clodius-input/hg19/E014-H3K27me3.fc.signal.bigwig --assembly hg19
################################################################################################################################
###### Gene Annotations
AWS_ES_DOMAIN=52.23.165.123:9872
INPUT_FILE=~/data/hg19/genbank-output/refgene-count-minus
DATASET_NAME=hg19/refgene-tiles-minus
OUTPUT_DIR=~/data/${DATASET_NAME}
/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $DATASET_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
### Unbalanced coolers
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
RESOLUTION=50000
DATASET_NAME=Dixon2015-H1_hESC-HindIII-allreps-filtered.50kb.cool.unbalanced.genome.sorted.gz
INDEX_NAME=${ASSEMBLY}/${DATASET_NAME}
#INDEX_NAME=${DATASET_NAME,,}/tiles
echo $INDEX_NAME
FILENAME=coolers/${ASSEMBLY}/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
#curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
zcat ${FILEPATH} | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME} --num-threads 4 --triangular --log-file clodius.log --max-queue-size 2000 # 16:48:26
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
RESOLUTION=5000
DATASET_NAME=Dixon2015-H1_hESC-HindIII-allreps-filtered.5kb.cool.unbalanced.genome.sorted.gz
INDEX_NAME=${ASSEMBLY}/${DATASET_NAME}
#INDEX_NAME=${DATASET_NAME,,}/tiles
echo $INDEX_NAME
FILENAME=coolers/${ASSEMBLY}/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
#curl -XDELETE "http://${AWS_ES_DOMAIN}/${DATASET_NAME,,}"
zcat ${FILEPATH} | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly ${ASSEMBLY} -b 256 -r ${RESOLUTION} --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME} --num-threads 4 --triangular --log-file clodius.log --max-queue-size 2000 # 16:48:26
### Gene density
ASSEMBLY=hg19
DATASET_NAME=gencodeDensity.bedGraph.txt
INDEX_NAME=hg19.1/${DATASET_NAME}
FILENAME=encode/${ASSEMBLY}/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
python scripts/process_file.py --assembly hg19 --type bedgraph $FILEPATH
#zcat ~/data/clodius-input/hg19/geneDensity3.bedGraph.txt.genome.sorted.gz | head -n 50 | /usr/bin/time pypy scripts/make_single_threaded_tiles.py --min-pos 1 --max-pos 10000000 -b 256 -r 1 --expand-range 1,2 --ignore-0 -k 1 -v 3 --elasticsearch-url 52.23.165.123:9872/hg19.1/geneDensity3.bedgraph.txt.genome.sorted.gz.1
zcat ${FILEPATH}.genome.sorted.gz | /usr/bin/time python scripts/make_single_threaded_tiles.py --assembly hg19 -b 256 -r 1 --expand-range 1,2 --ignore-0 -k 1 -v 3 --elasticsearch-url 52.23.165.123:9872/${INDEX_NAME} --print-status
### Gene Information
AWS_ES_DOMAIN=54.197.186.181:9872
INPUT_FILE=~/data/hg19/genbank-output/refgene-count-minus
INDEX_NAME=hg19/refgene-tiles-minus
OUTPUT_DIR=~/data/${DATASET_NAME}
curl -XDELETE "http://${AWS_ES_DOMAIN}/${INDEX_NAME}/_query" -d '{
"query" : {
"match_all" : {}
}
}'
/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 --assembly hg19 ${INPUT_FILE}
#/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 --assembly hg19 /tmp/perm2
### Nometonome
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=GCF_000005845.2_ASM584v2_genomic
cat ~/projects/nometonome/contacts/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome | /usr/bin/time pypy scripts/make_single_threaded_tiles.py --min-pos 1,1 --max-pos 4641652,4641652 -b 256 -r 1 -v 3 --elasticsearch-url 52.23.165.123:9872/hg19.1/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome
# View configuration (JSON) for displaying the resulting tiles:
[
{
"chromInfoPath": "//s3.amazonaws.com/pkerp/data/hg19/chromInfo.txt",
"domain": [
0,
4641652
],
"viewStyle": {
"float": "left",
"padding": "5px",
"width": "100%"
},
"tracks": [
{
"source": "//52.23.165.123:9872/hg19.1/GCF_000005845.2_ASM584v2_genomic.22.contacts.genome",
"type": "heatmap",
"height": 300
}
],
"zoomLock": 0
}
]
AWS_ES_DOMAIN=52.23.165.123:9872
ASSEMBLY=hg19
DATASET_NAME=refGeneBed.bedGraph.txt.genome.sorted.gz
INDEX_NAME=hg19.x/${DATASET_NAME}
FILEPATH=/data/encode/${ASSEMBLY}/${DATASET_NAME}
zcat ${FILEPATH} | /usr/bin/time pypy scripts/make_single_threaded_tiles.py --assembly hg19 -b 256 -r 1 --expand-range 1,2 --ignore-0 -k 1 -v 3 --elasticsearch-url ${AWS_ES_DOMAIN}/${INDEX_NAME}
##########################################################################################
### HiC data
#########################################################################################
### Smaller test set
### Real data set
FILENAME=rao_et_al/HMEC/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
FILENAME=rao_et_al/HUVEC/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
FILENAME=rao_et_al/IMR90/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
FILENAME=rao_et_al/GM12878_primary/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved
DATASET_NAME=hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz
FILENAME=coolers/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
zcat $FILEPATH > ${FILEPATH}.mirrored
zcat $FILEPATH | awk '{ print $2 "\t" $1 "\t" $3; }' >> ${FILEPATH}.mirrored
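# The .mirrored.shuffled file used below is not produced by the commands above; a shuffling
# step is assumed here, e.g. (hypothetical):
#shuf ${FILEPATH}.mirrored > ${FILEPATH}.mirrored.shuffled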
head -n 40000000 ${FILEPATH}.mirrored.shuffled > ${FILEPATH}.short
#SPARK_HOME_DIR=/Users/peter/Downloads/spark-1.6.1
#SPARK_HOME_DIR=/home/ubuntu/apps/spark-1.6.1-bin-hadoop2.6
SPARK_HOME_DIR=~/spark-home
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark ${FILEPATH}.short --elasticsearch-nodes localhost:9200 --elasticsearch-path test_shorter/tiles
# Run locally
# OUTPUT_DIR=${FILEPATH}.short.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense ${FILEPATH}.short
/usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark --elasticsearch-nodes localhost:9200 --elasticsearch-path ${DATASET_NAME} ${FILEPATH}.mirrored.shuffled
#OUTPUT_DIR=${FILEPATH}.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -o $OUTPUT_DIR -v count -p pos1,pos2 -c pos1,pos2,count -i count -r 1000 -b 256 --max-zoom 20 --output-format dense --use-spark ${FILEPATH}.mirrored
#find $OUTPUT_DIR -name "*.json" | xargs chmod a+r
aws s3 sync --region us-west-2 ~/data/${FILENAME}.tiles s3://pkerp/data/${FILENAME}.tiles
##########################################################################################
### Gene annotations
#########################################################################################
ASSEMBLY=mm9
AWS_ES_DOMAIN=52.23.165.123:9872
DATASET_NAME=refgene-tiles-minus
INPUT_FILE=~/data/${ASSEMBLY}/genbank-output/refgene-count-minus
INDEX_NAME=hg19.1/${ASSEMBLY}.${DATASET_NAME}
/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
DATASET_NAME=refgene-tiles-plus
INPUT_FILE=~/data/${ASSEMBLY}/genbank-output/refgene-count-plus
INDEX_NAME=hg19.1/${ASSEMBLY}.${DATASET_NAME}
/usr/bin/time python scripts/make_tiles.py --elasticsearch-nodes ${AWS_ES_DOMAIN} --elasticsearch-path $INDEX_NAME -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 18 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
#rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 5 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
#rsync -avzP $OUTPUT_DIR/ ~/projects/goomba/.tmp/jsons/${DATASET_NAME}
#rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time python scripts/make_tiles.py -o $OUTPUT_DIR -v count --position genomeTxStart --end-position genomeTxEnd -c refseqid,chr,strand,txStart,txEnd,genomeTxStart,genomeTxEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,geneName,count,uid --max-zoom 5 -i count --importance --reverse-importance --max-entries-per-tile 16 $INPUT_FILE
#rsync -avzP $OUTPUT_DIR/ ~/projects/goomba/.tmp/jsons/${DATASET_NAME}
#aws s3 sync $OUTPUT_DIR s3://pkerp/$OUTPUT_PART
#aws s3 sync $OUTPUT_DIR s3://pkerp/$OUTPUT_PART
##########################################################################################
## Wiggle Tracks from BEDGraph files
##########################################################################################
DATASET_NAME=hg19/E116-DNase.fc.signal.bigwig
FILENAME=ENCODE/${DATASET_NAME}
FILEPATH=~/data/${FILENAME}
#bigWigToBedGraph ${FILEPATH} ${FILEPATH}.bedGraph # 60 seconds
#cat ${FILEPATH}.bedGraph | awk '{print $1,$2,$1,$3,$4}' | chr_pos_to_genome_pos.py -e 4 > ${FILEPATH}.bedGraph.genome
head -n 1000000 ${FILEPATH}.bedGraph.genome > ${FILEPATH}.short
SPARK_HOME_DIR=~/spark-home
OUTPUT_DIR=${FILEPATH}.short.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v value -c pos1,pos2,value --position pos1 --range pos1,pos2 --range-except-0 value -i value --resolution 1 --bins-per-dimension 64 --max-zoom 20 --use-spark ${FILEPATH}.short -o $OUTPUT_DIR
aws s3 sync $OUTPUT_DIR s3://pkerp/data/served/$DATASET_NAME
#OUTPUT_DIR=${FILEPATH}.short.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v value -c chrom,pos1,pos2,value --position pos1 --range pos1,pos2 -i value --resolution 1 --bins-per-dimension 64 --max-zoom 20 --use-spark ${FILEPATH}.short --elasticsearch-nodes localhost:9200 --elasticsearch-path test_short/bed
SPARK_HOME_DIR=~/spark-home
DATASET_NAME=sample_data/E116-DNase.fc.signal.bigwig.bedGraph.genome.100000
FILEPATH=test/${DATASET_NAME}
OUTPUT_DIR=${FILEPATH}.tiles; rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR; /usr/bin/time ${SPARK_HOME_DIR}/bin/spark-submit scripts/make_tiles.py -v value -c pos1,pos2,value --position pos1 --range pos1,pos2 --range-except-0 value -i value --resolution 1 --bins-per-dimension 64 --max-zoom 5 --use-spark ${FILEPATH} -o $OUTPUT_DIR
rsync -avzP $OUTPUT_DIR/ ~/projects/goomba/.tmp/jsons/${DATASET_NAME}
#aws s3 sync $OUTPUT_DIR s3://pkerp/data/served/${DATASET_NAME}.tiles
## Small file
OUTPUT_DIR=output
rsync -a --delete blank/ $OUTPUT_DIR; mkdir -p $OUTPUT_DIR;
python scripts/make_tiles.py -o $OUTPUT_DIR -v value -c chrom,pos1,pos2,value --range pos1,pos2 -i value test/data/smallBedGraph.tsv --delimiter ' ' --position pos1 --resolution 1 --max-zoom 14 --output-format dense --bins-per-dimension 128
### Real file
OUTPUT_DIR=~/data/ENCODE/2016-05-16-GM12878-RNASeq/tiles
/usr/bin/time spark-submit --driver-memory 8G scripts/make_tiles.py -o $OUTPUT_DIR -v value -c chrom,pos1,pos2,value --range pos1,pos2 -i value --position pos1 --resolution 1 --max-zoom 14 --output-format dense --bins-per-dimension 128 ~/data/ENCODE/2016-05-16-GM12878-RNASeq/ENCFF000FAA_chr1.bedGraph --use-spark
aws s3 sync --region us-west-2 $OUTPUT_DIR s3://pkerp/data/ENCODE/2016-05-16-GM12878-RNASeq/tiles
## BAM files
samtools view -h data/bam/GM12878_SRR1658581_10pc_3_R1_hg19.bwt2glob.bam | head -n 65536 | samtools view -Sb > data/bam/65536.bam
####
To silence Spark's verbose output, turn off logging in log4j.properties, place the log4j.properties file in ~/.spark-conf, and point Spark at that directory:
export SPARK_CONF_DIR=~/.spark-conf
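# For reference, a minimal log4j.properties sketch that only shows errors (assumes the
# log4j 1.x property format that Spark 1.6 ships with; the pattern layout is illustrative):
mkdir -p ~/.spark-conf
cat > ~/.spark-conf/log4j.properties <<'EOF'
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
EOF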
#### Create ElasticSearch mapping
curl -XGET "http://127.0.0.1:9200/test_short/_optimize"
curl -XGET "http://127.0.0.1:9200/test_short/_mapping"
curl -XGET "http://127.0.0.1:9200/test_short/_stats"
curl -XDELETE "http://search-es4dn-z7rzz4kevtoyh5pfjkmjg5jsga.us-east-1.es.amazonaws.com/hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz.mirrored.shuffled/"
curl -XGET "http://search-es4dn-z7rzz4kevtoyh5pfjkmjg5jsga.us-east-1.es.amazonaws.com/hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz.mirrored.shuffled/14.21.12077"
curl -XGET "search-es4dn-z7rzz4kevtoyh5pfjkmjg5jsga.us-east-1.es.amazonaws.com/hg19/Dixon2015-H1hESC_ES-HindIII-allreps-filtered.1kb.genome.gz.mirrored.shuffled/_search" -d '
{
"query" : {
"match_all" : {}
}
}'
curl -XDELETE "http://127.0.0.1:9200/hg19"
curl -XPUT "localhost:9200/hg19" -d '
curl -XDELETE "http://search-higlass-ssxwuix6kow3sekyeresi7ay5e.us-east-1.es.amazonaws.com/hg19"
curl -XPUT "http://search-higlass-ssxwuix6kow3sekyeresi7ay5e.us-east-1.es.amazonaws.com/hg19" -d '
{
"mappings": {
"_default_": {
"dynamic_templates": [
{ "notanalyzed": {
"match": "*",
"mapping": {
"index": "no"
}
}
}
]
}
}
}'
#########################################################################################
#### Preparing test data
###########################################################################################
head -n 20212 ~/data/ENCODE/hg19/E116-DNase.fc.signal.bigwig.bedGraph.genome > test/sample_data/E116-DNase.fc.signal.bigwig.bedGraph.genome.20212
head -n 100000 ~/data/ENCODE/hg19/E116-DNase.fc.signal.bigwig.bedGraph.genome > test/sample_data/E116-DNase.fc.signal.bigwig.bedGraph.genome.100000