from os.path import isfile
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
# Input and output file paths
path_in='run/processed.tsv.gz'
path_out='run/grn.tsv'
# Fail early if the input file is missing.
# FileNotFoundError is the built-in exception for a missing path;
# NotImplementedError is reserved for unimplemented methods and would mislead the reader.
if not isfile(path_in):
	raise FileNotFoundError(f"File {path_in} not found.")

# Exercise blank: read the DE results table into `df` with pandas.read_csv
# (presumably from `path_in` defined above — fill in the arguments).
#### Your code:df=pd.read_csv(???)
# Preview the first 3 rows of the loaded table
df.head(3)

# Whether each row is the DE results of a gene targeted by the CRISPR KO
# (`is_target` is used below as a row selector on `df`)
#### Your code:is_target=???
# Extract DE results of targeted genes:
# keep the selected rows, use the 'target' column as the index, and drop the 'gene' column
df_target_initial=df[is_target].set_index('target',drop=True).drop(columns=['gene'])
# Display the resulting table
df_target_initial

# Create a small figure with a single axes for the volcano plot
fig,ax=plt.subplots(figsize=(3,2.5))
# Draw scatter plot of log2 fold change vs -log10(p-value)
# (x is log2 fold change and y is -log10(p-value), matching the axis labels set below)
#### Your code:ax.scatter(???,s=10)
ax.set_xlabel('log2 fold change');ax.set_ylabel('-log10(p-value)')
ax.set_title('Differential expression of perturbed genes'); plt.show()

# Raw P value cutoff for differential expression of targeted gene A
cut_p_target=1E-5
# LogFC cutoff for differential expression of targeted gene A
cut_lfc_target=0.5
# Identify perturbations leading to significant repression of the targeted gene
# (combines a condition on the 'p' column with a second condition left to fill in;
#  presumably a negative-direction test on 'lfc' using cut_lfc_target — confirm)
#### Your code:is_repression=(df_target_initial['p']<=???)&(???)
# Extract perturbations leading to significant repression
# (index labels of df_target_initial selected by `is_repression`)
targets=df_target_initial.index[is_repression]
# Display the selected perturbations
targets

Index(['AKAP8', 'BPTF', 'CBFB', 'CLOCK', 'ELK4', 'EPAS1', 'ETS1', 'FOXK1',
       'FOXP1', 'GATA3', 'IL2RA', 'IRF2', 'JAK3', 'KLF9', 'KMT2A', 'MBD2',
       'MED12', 'NFE2L3', 'NFKB1', 'PTEN', 'REL', 'RELA', 'RELB', 'RORC',
       'SETDB1', 'SREBF1', 'STAT1', 'STAT2', 'STAT3', 'STAT5A', 'STAT5B',
       'TBX21', 'TCF3', 'TP53', 'TTF1', 'YBX1', 'YY1', 'ZBTB24', 'ZKSCAN1',
       'ZNF217'],
      dtype='object', name='target')

# Filter DE results of targeted genes with perturbations leading to significant repression
# (label-based row selection using the `targets` index computed earlier)
df_target=df_target_initial.loc[targets]
# Filter DE results of untargeted genes with perturbations leading to significant repression
# (this overwrites `df` with the filtered table)
#### Your code:df=df[???]
# Show (rows, columns) of the filtered table
df.shape

(67305, 4)

# Adjusted P value cutoff for differential expression of untargeted genes
cut_p=0.05
# LogFC cutoff for differential expression of untargeted genes
cut_lfc=0.5

# Initialize empty list of edges
# (later filled with (node_start, node_end, edge_weight) tuples)
edges=[]
# Use the first targeted gene to try your code
target=targets[0]
# Extract differential expression results for the targeted gene
# (presumably the row of `df_target` for `target` — confirm)
#### Your code: s_target=???
# Extract differential expression results for the first perturbation
#### Your code:df1=df[???]
# Extract differential expression results for untargeted genes
# (a subset of `df1`)
#### Your code: df1_other=df1[???]
# Inspect the intermediate results
print(s_target)
df1_other.head(2)

p      3.611740e-14
lfc   -8.319442e-01
Name: AKAP8, dtype: float64

# Identify which genes are differentially expressed due to the perturbation
# (used below as a row selector on `df1_other`; presumably built from cut_p and cut_lfc — confirm)
#### Your code:is_de=(???)&(???)
# Extract differentially expressed genes due to this perturbation
df1_de=df1_other[is_de]
# Use a loop to create edges in the GRN, one at a time iteratively from the targeted gene to each differentially expressed gene
# (`idx` iterates over the row labels of `df1_de`)
for idx in df1_de.index:
	# Starting node of the edge
	#### Your code:node_start=???
	# Ending node of the edge
	#### Your code:node_end=???
	# Weight of the edge
	#### Your code:edge_weight=???
	# Add the edge
	# (stored as a 3-tuple appended to the shared `edges` list)
	edges.append((node_start,node_end,edge_weight))
# Report how many edges were collected and preview the first three
print('Number of edges:',len(edges))
edges[:3]

Number of edges: 2

[('AKAP8', 'TSC22D4', np.float64(-0.6880808472308385)),
 ('AKAP8', 'FRMD8', np.float64(0.6606878405909893))]

# Re-initialize empty list of edges
# (discards the edges collected while testing on a single perturbation)
edges=[]
# Repeat the single-perturbation procedure for every retained perturbation
for target in targets:
	# Paste here all the code you wrote in the two code blocks above, starting after the line `target=targets[0]`. Exclude the `print` lines and increase the indentation by one level.
	#### Your code starts here ####
	???
	##### Your code ends here #####
# Report how many edges were collected and preview the last three
print('Number of edges:',len(edges))
edges[-3:]

Number of edges: 10418

[('ZNF217', 'BCORP1', np.float64(-0.8909980321828219)),
 ('ZNF217', 'ENSG00000289707', np.float64(-0.6030418232529909)),
 ('ZNF217', 'MT-TI', np.float64(0.6806038568444858))]

# Tabulate the collected edge tuples, one row per edge
df_grn=pd.DataFrame(
	edges,
	columns=['source','target','weight'],
)
# Build a directed graph from the edge table, carrying 'weight' along as an edge attribute
grn=nx.from_pandas_edgelist(
	df_grn,
	source='source',
	target='target',
	edge_attr='weight',
	create_using=nx.DiGraph(),
)
# Preview the first rows of the edge table
df_grn.head()

# Save edge properties into tsv file
# (writes `df_grn` to `path_out`; the remaining to_csv arguments are left to fill in)
#### Your code:df_grn.to_csv(path_out,???)
# Show the first 4 lines of the file
# (reads the file back as text; trailing whitespace/newline is stripped from each printed line)
with open(path_out,'r') as f:
	for _ in range(4):
		print(f.readline().rstrip())

source	target	weight
AKAP8	TSC22D4	-0.6880808472308385
AKAP8	FRMD8	0.6606878405909893
BPTF	ENSG00000217801	-0.9996829704895553

# Number of nodes
#### Your code:nn=???
# Number of edges
#### Your code:ne=???
# Out-degree (number of target genes)
# (must be something `dict(...)` accepts, since it is converted just below)
#### Your code:odegree=???
# Convert to pandas.Series
odegree=pd.Series(dict(odegree))
# Limit to nodes with at least one target
#### Your code:odegree=odegree[???]
# Sort out-degree in descending order
#### Your code:???
# Create a dataframe of nodes with network information
# NOTE(review): this appears to rely on `odegree` being an unnamed Series so that its values
# land in the 'outdegree' column; `odegree.to_frame('outdegree')` would not depend on that — confirm
df_nodes=pd.DataFrame(odegree,index=odegree.index,columns=['outdegree'])
# `len(odegree)` counts the nodes kept after the at-least-one-target filter, reported as regulators
print(f'Number of nodes/regulators/edges: {nn}/{len(odegree)}/{ne}')
# Preview the top of the out-degree Series
odegree.head()

Number of nodes/regulators/edges: 5100/34/10418

CBFB     2052
MED12    1787
YY1      1038
ETS1      752
PTEN      687
dtype: int64

# Plot how many regulators share each out-degree value
fig,ax=plt.subplots(figsize=(3.2,2))
outdegree_values=list(odegree.values)
# Tally in a single pass: np.unique returns the distinct out-degrees (sorted) and their counts.
# This replaces calling list.count once per distinct value, which rescans the whole list each time.
unique_outdegrees,occurrence=np.unique(outdegree_values,return_counts=True)
# One stem per distinct out-degree; the stem height is the number of regulators with that out-degree
ax.stem(unique_outdegrees,occurrence,linefmt='black',markerfmt='ko',basefmt='none')
# Log-scaled x axis
ax.set_xscale('log')
ax.set_xlabel('Out-degree (number of target genes)');ax.set_ylabel('Count')
ax.set_title('Out-degree distribution'); plt.show()

# Merge with differential expression results
# (inner join on the index with `df_nodes`, so only rows present in both tables are kept)
#### Your code:df_volcano=pd.merge(...,df_nodes,left_index=True,right_index=True,how='inner')
fig,ax=plt.subplots(figsize=(3,2))
# Draw scatter plot of log2 fold change vs -log10(p-value)
# (marker size scales with the log of a `df_volcano` column left to fill in — presumably 'outdegree')
#### Your code:ax.scatter(df_volcano['lfc'],-np.log10(df_volcano['p']),s=np.log(df_volcano[...])*10,c='k',alpha=0.4,lw=0)
ax.set_xlabel('log2 fold change');ax.set_ylabel('-log10(p-value)')
ax.set_title('Differential expression of perturbed genes'); plt.show()

	gene	target	p	lfc
0	ISG15	AKAP8	0.385355	-0.251955
1	ACOT7	AKAP8	0.349425	-0.264555
2	TNFRSF25	AKAP8	0.277335	0.236204

	p	lfc
target
AKAP8	3.611740e-14	-0.831944
ARID5A	2.570915e-02	0.274388
ATXN7L3	2.849576e-03	0.242471
BPTF	1.747578e-36	-0.778508
CBFB	1.005515e-153	-1.723185
...	...	...
ZKSCAN1	2.101872e-30	-1.034257
ZNF217	1.588385e-27	-0.845768
ZNF319	3.180268e-14	0.900860
ZNF341	3.103559e-03	0.795138
ZNF655	2.228071e-03	0.197382

	gene	target	p	lfc
0	ISG15	AKAP8	0.385355	-0.251955
1	ACOT7	AKAP8	0.349425	-0.264555

Gene Regulation III - Workshop¶

aka Gene Regulatory Network (GRN)¶

Introduction¶

Practicalities¶

How: imagine you are doing research¶

TOC¶

0. Preparation¶

1. Load DE results¶

2. List perturbations and DE results of associated genes¶

3. Check $P_A$-|$A$ visually with volcano plot¶

3. Find perturbations with significant repression¶

3. Retain perturbations with significant repression¶

4. Test $P_A$->$B$ or $P_A$-|$B$ - parameters¶

4. Test $P_A$->$B$ or $P_A$-|$B$ - 1 perturbation¶

4. Test $P_A$->$B$ or $P_A$-|$B$ - 1 perturbation¶

4. Now use a loop to iterate over all remaining perturbations¶

4. Create pandas.Dataframe and networkx.DiGraph objects¶

5. Save network to file¶

5. Install Cytoscape (optional)¶

6. Basic properties of the network¶

6. Plot out-degree distribution¶

6. Redraw volcano plot with out-degree¶

7. Load network into Cytoscape¶

7. Assign column properties¶

7. Show network details¶

7. Search for a gene¶

7. Get a subnetwork¶

7. Show arrow direction¶

7. Change edge color¶

8. The END¶

	source	target	weight
0	AKAP8	TSC22D4	-0.688081
1	AKAP8	FRMD8	0.660688
2	BPTF	ENSG00000217801	-0.999683
3	BPTF	ZNF593	1.284137
4	BPTF	TMEM39B	0.777404