API Reference
API reference for developers.
Find and score peaks
Call peaks from a coverage track and score them using the LanceOTron model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file | str | Path to bigwig track. | required |
| threshold | int | Initial threshold used for selecting candidate peaks. Defaults to 4. | 4 |
| window | int | Window size for rolling mean to use for selecting candidate peaks. Defaults to 400. | 400 |
| folder | str | Output folder. Defaults to "./". | './' |
| skipheader | bool | Skip writing header. Defaults to False. | False |
Source code in lanceotron/modules.py
def find_and_score_peaks(
    file: str,
    threshold: int = 4,
    window: int = 400,
    folder: str = "./",
    skipheader: bool = False) -> None:
    """Call peaks from a coverage track and score them using the LanceOTron model.

    Candidate regions are selected per chromosome from a smoothed coverage
    map using a dynamic threshold, scored with the wide-and-deep Keras model,
    and written as a tab-separated file named <track>_L-tron.bed in `folder`.

    Args:
        file (str): Path to bigwig track.
        threshold (int, optional): Initial threshold used for selecting candidate peaks. Defaults to 4.
        window (int, optional): Window size for rolling mean to use for selecting candidate peaks. Defaults to 400.
        folder (str, optional): Output folder. Defaults to "./".
        skipheader (bool, optional): Skip writing header. Defaults to False.
    """
    bigwig_file = file
    out_folder = make_directory_name(folder)
    initial_threshold = threshold
    min_peak_width = 50
    max_peak_width = 2000
    read_coverage_factor = 10**9
    # Output name is derived from the track's base filename.
    out_file_name = bigwig_file.split('/')[-1].split('.')[0]+'_L-tron.bed'
    # Normalisation factor derived from the track's total coverage (sumData).
    pyBigWig_object = pyBigWig.open(bigwig_file)
    read_coverage_total = pyBigWig_object.header()['sumData']
    read_coverage_rphm = read_coverage_total/read_coverage_factor
    pyBigWig_object.close()
    bigwig_data = Ltron.Bigwig_data(bigwig_file)
    genome_stats_dict = bigwig_data.get_genome_info()
    bed_file_out = []
    for chrom in genome_stats_dict:
        print(chrom)
        coverage_array_smooth = bigwig_data.make_chrom_coverage_map(genome_stats_dict[chrom], smoothing=window)
        enriched_region_coord_list = Ltron.label_enriched_regions_dynamic_threshold_width(coverage_array_smooth, genome_stats_dict[chrom]['chrom_mean']*initial_threshold, genome_stats_dict[chrom]['chrom_mean'], max_peak_width, min_region_size=min_peak_width)
        chrom_file_out = []
        if enriched_region_coord_list:
            wide_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_wide_v5_03.p')
            deep_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_deep_v5_03.p')
            coverage_array = bigwig_data.make_chrom_coverage_map(genome_stats_dict[chrom])/read_coverage_rphm
            X_wide_array, X_deep_array = Ltron.extract_signal_wide_and_deep_chrom(coverage_array, enriched_region_coord_list, read_coverage_rphm)
            # Use `with` so the pickled-scaler file handles are closed promptly;
            # the previous bare open() leaked a handle per chromosome.
            with open(wide_path, 'rb') as wide_fh:
                standard_scaler_wide = pickle.load(wide_fh)
            X_wide_array_norm = standard_scaler_wide.transform(X_wide_array)
            X_wide_array_norm = np.expand_dims(X_wide_array_norm, axis=2)
            # Standardise each region's deep signal, then apply the pre-fitted
            # deep scaler shipped with the package.
            standard_scaler = StandardScaler()
            X_deep_array_norm_T = standard_scaler.fit_transform(X_deep_array.T)
            with open(deep_path, 'rb') as deep_fh:
                standard_scaler_deep = pickle.load(deep_fh)
            X_deep_array_norm = standard_scaler_deep.transform(X_deep_array_norm_T.T)
            X_deep_array_norm = np.expand_dims(X_deep_array_norm, axis=2)
            model = build_model()
            model_classifications = model.predict([X_deep_array_norm, X_wide_array_norm], verbose=1)
            K.clear_session()
            for i, coord_pair in enumerate(enriched_region_coord_list):
                # First component of each of the model's three softmax heads:
                # overall, shape, and enrichment scores.
                out_list = [chrom, coord_pair[0], coord_pair[1], model_classifications[0][i][0], model_classifications[1][i][0], model_classifications[2][i][0]]
                X_wide_list = X_wide_array[i][:-1].tolist()
                # Wide-feature values above 10 are clamped to 100 in the output.
                X_wide_list = [100. if x>10 else x for x in X_wide_list]
                out_list+=X_wide_list
                chrom_file_out.append(out_list)
            bed_file_out+=chrom_file_out
    with open(out_folder+out_file_name, 'w', newline='') as f:
        if not skipheader:
            f.write('chrom\tstart\tend\toverall_peak_score\tshape_score\tenrichment_score\tpvalue_chrom\tpvalue_10kb\tpvalue_20kb\tpvalue_30kb\tpvalue_40kb\tpvalue_50kb\tpvalue_60kb\tpvalue_70kb\tpvalue_80kb\tpvalue_90kb\tpvalue_100kb\n')
        bed_writer = csv.writer(f, delimiter='\t')
        bed_writer.writerows(bed_file_out)
Call peaks with input
Call peaks from a coverage track with an input and score them using the LanceOTron model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file | str | Path to bigwig track. | required |
| input | str | Control input track used to calculate Poisson-based significance of peaks. | required |
| threshold | int | Initial threshold used for selecting candidate peaks. Defaults to 4. | 4 |
| window | int | Window size for rolling mean to use for selecting candidate peaks. Defaults to 400. | 400 |
| folder | str | Output folder. Defaults to "./". | './' |
| skipheader | bool | Skip writing header. Defaults to False. | False |
Source code in lanceotron/modules.py
def call_peaks_with_input(
    file: str,
    input: str,
    threshold: int = 4,
    window: int = 400,
    folder: str = "./",
    skipheader: bool = False) -> None:
    """Call peaks from a coverage track with an input and score them using the LanceOTron model.

    Same pipeline as find_and_score_peaks, with one extra column: a p-value
    computed against the control input track for each candidate region.
    Results are written as <track>_L-tron.bed in `folder`.

    Args:
        file (str): Path to bigwig track.
        input (str): Control input track used to calculate Poisson-based significance of peaks.
            (NOTE: parameter shadows the `input` builtin, but the name is part
            of the public interface and is kept for compatibility.)
        threshold (int, optional): Initial threshold used for selecting candidate peaks. Defaults to 4.
        window (int, optional): Window size for rolling mean to use for selecting candidate peaks. Defaults to 400.
        folder (str, optional): Output folder. Defaults to "./".
        skipheader (bool, optional): Skip writing header. Defaults to False.
    """
    bigwig_file = file
    control_file = input
    out_folder = make_directory_name(folder)
    initial_threshold = threshold
    min_peak_width = 50
    max_peak_width = 2000
    read_coverage_factor = 10**9
    # Output name is derived from the track's base filename.
    out_file_name = bigwig_file.split('/')[-1].split('.')[0]+'_L-tron.bed'
    # Normalisation factor derived from the track's total coverage (sumData).
    pyBigWig_object = pyBigWig.open(bigwig_file)
    read_coverage_total = pyBigWig_object.header()['sumData']
    read_coverage_rphm = read_coverage_total/read_coverage_factor
    pyBigWig_object.close()
    bigwig_data = Ltron.Bigwig_data(bigwig_file)
    genome_stats_dict = bigwig_data.get_genome_info()
    bed_file_out = []
    for chrom in genome_stats_dict:
        print(chrom)
        coverage_array_smooth = bigwig_data.make_chrom_coverage_map(genome_stats_dict[chrom], smoothing=window)
        enriched_region_coord_list = Ltron.label_enriched_regions_dynamic_threshold_width(coverage_array_smooth, genome_stats_dict[chrom]['chrom_mean']*initial_threshold, genome_stats_dict[chrom]['chrom_mean'], max_peak_width, min_region_size=min_peak_width)
        chrom_file_out = []
        if enriched_region_coord_list:
            wide_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_wide_v5_03.p')
            deep_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_deep_v5_03.p')
            coverage_array = bigwig_data.make_chrom_coverage_map(genome_stats_dict[chrom])/read_coverage_rphm
            X_wide_array, X_deep_array = Ltron.extract_signal_wide_and_deep_chrom(coverage_array, enriched_region_coord_list, read_coverage_rphm)
            # Use `with` so the pickled-scaler file handles are closed promptly;
            # the previous bare open() leaked a handle per chromosome.
            with open(wide_path, 'rb') as wide_fh:
                standard_scaler_wide = pickle.load(wide_fh)
            X_wide_array_norm = standard_scaler_wide.transform(X_wide_array)
            X_wide_array_norm = np.expand_dims(X_wide_array_norm, axis=2)
            # Standardise each region's deep signal, then apply the pre-fitted
            # deep scaler shipped with the package.
            standard_scaler = StandardScaler()
            X_deep_array_norm_T = standard_scaler.fit_transform(X_deep_array.T)
            with open(deep_path, 'rb') as deep_fh:
                standard_scaler_deep = pickle.load(deep_fh)
            X_deep_array_norm = standard_scaler_deep.transform(X_deep_array_norm_T.T)
            X_deep_array_norm = np.expand_dims(X_deep_array_norm, axis=2)
            model = build_model()
            model_classifications = model.predict([X_deep_array_norm, X_wide_array_norm], verbose=1)
            # Open the control track for the p-value calculation below.
            # (An unused read_coverage_rphm_input local was removed here.)
            pyBigWig_input = pyBigWig.open(control_file)
            read_coverage_total_input = pyBigWig_input.header()['sumData']
            K.clear_session()
            for i, coord_pair in enumerate(enriched_region_coord_list):
                # Mean raw coverage over the region (de-normalised back to
                # track scale) feeds the significance test against the input.
                average_cov = coverage_array[coord_pair[0]:coord_pair[1]].mean()*read_coverage_rphm
                pvalue_input = calculate_pvalue_from_input(chrom, coord_pair[0], coord_pair[1], read_coverage_total, read_coverage_total_input, pyBigWig_input, average_cov)
                out_list = [chrom, coord_pair[0], coord_pair[1], model_classifications[0][i][0], model_classifications[1][i][0], model_classifications[2][i][0], pvalue_input]
                X_wide_list = X_wide_array[i][:-1].tolist()
                # Wide-feature values above 10 are clamped to 100 in the output.
                X_wide_list = [100. if x>10 else x for x in X_wide_list]
                out_list+=X_wide_list
                chrom_file_out.append(out_list)
            pyBigWig_input.close()
            bed_file_out+=chrom_file_out
    with open(out_folder+out_file_name, 'w', newline='') as f:
        if not skipheader:
            f.write('chrom\tstart\tend\toverall_peak_score\tshape_score\tenrichment_score\tpvalue_input\tpvalue_chrom\tpvalue_10kb\tpvalue_20kb\tpvalue_30kb\tpvalue_40kb\tpvalue_50kb\tpvalue_60kb\tpvalue_70kb\tpvalue_80kb\tpvalue_90kb\tpvalue_100kb\n')
        bed_writer = csv.writer(f, delimiter='\t')
        bed_writer.writerows(bed_file_out)
Score a bed file
Score an existing bed file using LanceOTron's model and a coverage track.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| file | str | Input bigWig track. | required |
| bed | str | Bed file of regions to score. | required |
| folder | str | Output folder. Defaults to "./". | './' |
| skipheader | bool | Skip writing header. Defaults to False. | False |
Source code in lanceotron/modules.py
def score_bed(
    file: str,
    bed: str,
    folder: str = "./",
    skipheader: bool = False) -> None:
    """Score an existing bed file using LanceOTron's model and a coverage track.

    Regions are taken from the bed file (no candidate selection), grouped per
    chromosome, scored with the wide-and-deep Keras model, and written as
    <track>_L-tron.bed in `folder`.

    Args:
        file (str): Input bigWig track.
        bed (str): Bed file of regions to score.
        folder (str): Output folder. Defaults to "./".
        skipheader (bool): Skip writing header. Defaults to False.
    """
    bigwig_file = file
    out_folder = make_directory_name(folder)
    bed_file = bed
    read_coverage_factor = 10**9
    # Output name is derived from the track's base filename.
    out_file_name = bigwig_file.split('/')[-1].split('.')[0]+'_L-tron.bed'
    # Normalisation factor derived from the track's total coverage (sumData).
    pyBigWig_object = pyBigWig.open(bigwig_file)
    read_coverage_total = pyBigWig_object.header()['sumData']
    read_coverage_rphm = read_coverage_total/read_coverage_factor
    pyBigWig_object.close()
    bed_list = Ltron.bed_file_to_list(bed_file)
    # Group coordinates by chromosome in a single pass (the previous version
    # rescanned the whole bed list once per chromosome). Dict insertion order
    # preserves first-occurrence chromosome order, matching prior output.
    coords_by_chrom = {}
    for bed_entry in bed_list:
        coords_by_chrom.setdefault(bed_entry[0], []).append([bed_entry[1], bed_entry[2]])
    bigwig_data = Ltron.Bigwig_data(bigwig_file)
    # Bed files may reference non-standard chromosomes, so include them all.
    genome_stats_dict = bigwig_data.get_genome_info(include_special_chromosomes=True)
    bed_file_out = []
    for chrom, enriched_region_coord_list in coords_by_chrom.items():
        print(chrom)
        chrom_file_out = []
        if enriched_region_coord_list:
            wide_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_wide_v5_03.p')
            deep_path = pkg_resources.resource_filename('lanceotron.static', 'standard_scaler_deep_v5_03.p')
            coverage_array = bigwig_data.make_chrom_coverage_map(genome_stats_dict[chrom])/read_coverage_rphm
            X_wide_array, X_deep_array = Ltron.extract_signal_wide_and_deep_chrom(coverage_array, enriched_region_coord_list, read_coverage_rphm)
            # Use `with` so the pickled-scaler file handles are closed promptly;
            # the previous bare open() leaked a handle per chromosome.
            with open(wide_path, 'rb') as wide_fh:
                standard_scaler_wide = pickle.load(wide_fh)
            X_wide_array_norm = standard_scaler_wide.transform(X_wide_array)
            X_wide_array_norm = np.expand_dims(X_wide_array_norm, axis=2)
            # Standardise each region's deep signal, then apply the pre-fitted
            # deep scaler shipped with the package.
            standard_scaler = StandardScaler()
            X_deep_array_norm_T = standard_scaler.fit_transform(X_deep_array.T)
            with open(deep_path, 'rb') as deep_fh:
                standard_scaler_deep = pickle.load(deep_fh)
            X_deep_array_norm = standard_scaler_deep.transform(X_deep_array_norm_T.T)
            X_deep_array_norm = np.expand_dims(X_deep_array_norm, axis=2)
            model = build_model()
            model_classifications = model.predict([X_deep_array_norm, X_wide_array_norm], verbose=1)
            K.clear_session()
            for i, coord_pair in enumerate(enriched_region_coord_list):
                # First component of each of the model's three softmax heads:
                # overall, shape, and enrichment scores.
                out_list = [chrom, coord_pair[0], coord_pair[1], model_classifications[0][i][0], model_classifications[1][i][0], model_classifications[2][i][0]]
                X_wide_list = X_wide_array[i][:-1].tolist()
                # Wide-feature values above 10 are clamped to 100 in the output.
                X_wide_list = [100. if x>10 else x for x in X_wide_list]
                out_list+=X_wide_list
                chrom_file_out.append(out_list)
            bed_file_out+=chrom_file_out
    with open(out_folder+out_file_name, 'w', newline='') as f:
        if not skipheader:
            f.write('chrom\tstart\tend\toverall_peak_score\tshape_score\tenrichment_score\tpvalue_chrom\tpvalue_10kb\tpvalue_20kb\tpvalue_30kb\tpvalue_40kb\tpvalue_50kb\tpvalue_60kb\tpvalue_70kb\tpvalue_80kb\tpvalue_90kb\tpvalue_100kb\n')
        bed_writer = csv.writer(f, delimiter='\t')
        bed_writer.writerows(bed_file_out)
Construct the LanceOTron model
Build and return the Lanceotron Keras model.
Source code in lanceotron/utils.py
def build_model():
    """Build and return the Lanceotron Keras model.

    Wide-and-deep architecture: a "wide" branch flattens a (12, 1) feature
    vector, a "deep" branch runs a (2000, 1) signal window through one
    initial conv layer plus four identical conv/pool blocks and a dense
    layer. Three softmax heads are produced (overall, shape, p-value
    classification) and pre-trained weights are loaded from the packaged
    v5_03 checkpoint.

    Returns:
        keras.models.Model: Model with inputs [input_deep, input_wide] and
        outputs [overall, shape, pvalue] classifications.
    """
    deep_dense_size = 10
    dropout_rate = 0.5
    first_filter_num = 70
    first_filter_size = 9
    hidden_filter_num = 120
    hidden_filter_size = 6
    learning_rate = 0.0001
    wide_and_deep_dense_size = 70
    # Wide branch: flatten the 12 hand-crafted features.
    input_wide = keras.layers.Input(shape=(12, 1))
    wide_model = keras.layers.Flatten()(input_wide)
    input_deep = keras.layers.Input((2000, 1))
    deep_model = input_deep
    # Deep branch: first conv layer (no pooling).
    deep_model = keras.layers.Convolution1D(first_filter_num, kernel_size=first_filter_size, padding='same')(deep_model)
    deep_model = keras.layers.BatchNormalization()(deep_model)
    deep_model = keras.layers.LeakyReLU()(deep_model)
    # Four identical conv -> batch-norm -> LeakyReLU -> max-pool blocks
    # (previously copy-pasted four times; the loop creates layers in the
    # same order, so the graph and weight loading are unchanged).
    for _ in range(4):
        deep_model = keras.layers.Convolution1D(hidden_filter_num, kernel_size=hidden_filter_size, padding='same')(deep_model)
        deep_model = keras.layers.BatchNormalization()(deep_model)
        deep_model = keras.layers.LeakyReLU()(deep_model)
        deep_model = keras.layers.MaxPool1D(pool_size=2)(deep_model)
    # Deep branch: dense layer with dropout, then flatten.
    deep_model = keras.layers.Dense(deep_dense_size)(deep_model)
    deep_model = keras.layers.BatchNormalization()(deep_model)
    deep_model = keras.layers.LeakyReLU()(deep_model)
    deep_model = keras.layers.Dropout(dropout_rate)(deep_model)
    deep_model = keras.layers.Flatten()(deep_model)
    # Shape classification head (deep branch only).
    shape_output = keras.layers.Dense(2, activation='softmax', name='shape_classification')(deep_model)
    # P-value classification head (wide branch only).
    pvalue_output = keras.layers.Dense(2, activation='softmax', name='pvalue_classification')(wide_model)
    # Combine wide features, deep features, and the p-value head's output.
    concat = keras.layers.concatenate([wide_model, deep_model, pvalue_output])
    wide_and_deep = keras.layers.Dense(wide_and_deep_dense_size)(concat)
    wide_and_deep = keras.layers.BatchNormalization()(wide_and_deep)
    wide_and_deep = keras.layers.LeakyReLU()(wide_and_deep)
    wide_and_deep = keras.layers.Dense(wide_and_deep_dense_size)(wide_and_deep)
    wide_and_deep = keras.layers.BatchNormalization()(wide_and_deep)
    wide_and_deep = keras.layers.LeakyReLU()(wide_and_deep)
    output = keras.layers.Dense(2, activation='softmax', name='overall_classification')(wide_and_deep)
    model = keras.models.Model(inputs=[input_deep, input_wide], outputs=[output, shape_output, pvalue_output])
    # Load the packaged pre-trained weights.
    model.load_weights(pkg_resources.resource_filename("lanceotron.static", "wide_and_deep_fully_trained_v5_03.h5"))
    return model