# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

# This program finds the largest distinct DNA segments you have based on your DNA matches.
# Optionally, you can run the code a second time. The goal would be that one run would find
# paternal segments and the other would find maternal segments, although, even before
# attempting such a program, one would expect the result to be far from perfect.
# This code was developed using Python 3.
# Note: it relies on DataFrame.append, which was removed in pandas 2.0, so run it with pandas < 2.0.

import csv
import pandas as pd

DATA_FILE = 'Your MyHeritage Segment-Level Matching File Name'
fields = ['Name', 'Match name', 'Chromosome', 'Start Location', 'End Location', 'Centimorgans', 'SNPs']

# Read in the data file
df_file = pd.read_csv(DATA_FILE, delimiter=',', usecols=fields, encoding='latin-1')
# Get rid of NaN values
df_file.dropna(inplace=True)
# Get rid of segments that match on less than 2300 SNPs
df_raw = df_file[df_file['SNPs'] >= 2300].copy().drop_duplicates()
# Get rid of unused columns. Match name isn't needed for the program. It will be merged back in at the end.
df_raw.drop(['Name', 'Match name', 'SNPs'], axis=1, inplace=True)
df_raw.columns = ['Chr.', 'Start', 'End', 'cM']  # Rename columns
df_map = df_file.copy().drop_duplicates()
del df_file
df_map.drop(['Name', 'Centimorgans'], axis=1, inplace=True)
df_map.columns = ['Name', 'Chr.', 'Start', 'End', 'SNPs']

# Initialize the output dataframe that will print results to a file.
# This will be the same whether phase 2 is run or only phase 1.
df_out = pd.DataFrame(columns=['Chr.', 'Start', 'End', 'cM'])

# Loop through all 22 chromosomes
for chro_i in range(1, 23):
    df = df_raw[df_raw['Chr.'] == chro_i].copy().drop(['Chr.'], axis=1)
    # Need to sort by Start, then by cM, otherwise a smaller segment could get saved even if it's
    # followed by a larger segment w/ the same start value (because the first segment has Overlap = NaN)
    df = df.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
    df['MaxEnd'] = df['End'].cummax()
    # Drop segments that are fully contained w/in a previous segment.
    df.drop(df[df['MaxEnd'] > df['End']].index, inplace=True)
    df.drop('MaxEnd', axis=1, inplace=True)
    # This will be for phase 1
    df1 = df.copy()
    df1 = df1.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)
    # df1out = df1.loc[0][:]  # creates a series
    df1out = pd.DataFrame({'Start': [df1.loc[0][0]], 'End': [df1.loc[0][1]], 'cM': [df1.loc[0][2]]})
    cols = list(df1out.columns)
    a, b, c = cols.index('End'), cols.index('Start'), cols.index('cM')
    cols[b], cols[a], cols[c] = cols[a], cols[b], cols[c]
    df1out = df1out[cols]
    new = [False]  # 0 for old, 1 for new
    # Make a column called 'New'. New rows are the only ones that can be dropped from the output.
    df1out.insert(loc=3, column='New', value=new)
    # Find non-overlapping segments for phase 1.
    for i in range(1, len(df1.index)):
        # Append a new (ith) row to the output
        newrow = pd.DataFrame({'Start': [df1.loc[i][0]], 'End': [df1.loc[i][1]], 'cM': [df1.loc[i][2]], 'New': True})
        df1out = df1out.append(newrow.loc[0, :])
        # Overlap will be calculated between a segment and the previous one,
        # so rows need to be sorted based on start values.
        df1out = df1out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
        # Calculate the overlap between a segment and the one from the previous row
        df1out['Overlap'] = df1out['End'].shift(1) - df1out['Start']
        # Checks overlap for all rows in case the new row has too much overlap.
        # Maybe in the future change it to check only the new row and those before/after it, which would be faster
        if (df1out['Overlap'] > 1000000).any():
            # Drop new seg. if any rows have overlap
            df1out.drop(df1out[df1out['New'] == True].index, inplace=True)
        # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe
        df1out.loc[:, 'New'] = 0
    # End of 'for loop' to find non-overlapping segs. for a given chromosome
    del df1
    df1out.drop(['New', 'Overlap'], axis=1, inplace=True)
    # Need to keep df1out for phase 2

    #################
    # Begin Phase 2 #
    #################

    # Keep only rows that didn't make it to the output of phase 1. The following method will only work
    # if duplicates have already been removed from both dataframes (which is true):
    df2 = pd.concat([df, df1out]).drop_duplicates(keep=False).sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)
    del df
    # Most of the rest of this code is trying to find a large cM row that overlaps a Phase 1 output row by ~50%
    # Need to delete everything except for the top ten segs. or so. using df1out (about top 10--change it once
    # I have enough rows) for better coverage and to make it likely that the Phase 2 output is from the other homologue.
    df1_test = df1out.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True).astype(int)
    # Use the 5 largest segs. from Phase 1 to find a good overlapping seg.
    df1_test = df1_test.iloc[0:5]
    # Ensure decent coverage from df2_test
    df1_start = df1_test['Start'].min()
    df1_end = df1_test['End'].max()
    # Could change this 7, maybe to the average number of segments (minus one) in the output of Phase 1:
    bin_num = 7  # Makes n + 1 bins
    bin_start = list(range(0, df1_end - int((df1_end - df1_start)/bin_num), int((df1_end - df1_start)/bin_num)))
    bin_end = list(range(int((df1_end - df1_start)/bin_num), df1_end, int((df1_end - df1_start)/bin_num)))
    df2_test = pd.DataFrame({'Start': bin_start, 'End': bin_end})
    cols = list(df2_test.columns)
    a, b = cols.index('End'), cols.index('Start')
    cols[b], cols[a] = cols[a], cols[b]
    df2_test = df2_test[cols]
    df2_test['cM'] = 0
    df2_test = pd.concat([df2, df2_test]).sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
    # df2_test will become really large, but the next line will only keep the largest cM rows in between the ones made above w/ 0 cM.
    # Resulting segs. are large w/ good coverage for comparing to df1out. Have confirmed that some can overlap.
    df2_test.drop(df2_test[(df2_test['cM'] == 0) | (df2_test['cM'] <= df2_test['cM'].shift(1))].index, inplace=True)
    # Make sure df2 is sorted by cM and indices are reset
    # Might want to add back in the ~10 largest segs.
    # just in case the above ones don't have good overlap
    df2_test = pd.concat([df2.iloc[0:20], df2_test]).drop_duplicates().reset_index(drop=True)
    df2_test_out = pd.DataFrame(columns=['Start', 'End', 'cM', 'Overlap'])
    for i in range(0, len(df2_test.index)):
        for j in range(0, len(df1_test.index)):
            # Ensure there is overlap:
            if (df2_test.loc[i]['Start'] < df1_test.loc[j]['End']) & (df1_test.loc[j]['Start'] < df2_test.loc[i]['End']):
                overlap = df2_test.loc[i]['End'] - df1_test.loc[j]['Start']
                if overlap < 0:
                    overlap = df1_test.loc[j]['End'] - df2_test.loc[i]['Start']
                # Get the abs. fraction of overlap
                overlap = abs(df2_test.loc[i]['End'] - df2_test.loc[i]['Start'])/overlap
                if (0.25 < overlap) & (overlap < 0.75):
                    newrow = pd.DataFrame({'Start': [df2_test.loc[i]['Start']], 'End': [df2_test.loc[i]['End']],
                                           'cM': [df2_test.loc[i]['cM']], 'Overlap': overlap})
                    df2_test_out = df2_test_out.append(newrow.loc[0, :])
                    # Break the inner (j) loop, since df1_test is only non-overlapping segs.
                    break
    del df1_test
    del df2_test
    df2_test_out['Overlap'] = abs(0.5 - df2_test_out['Overlap'])
    df2_test_out['Overlap'] = (df2_test_out['cM']**3)/(df2_test_out['Overlap'])
    df2out = df2_test_out[df2_test_out['Overlap'] == df2_test_out['Overlap'].max()].copy()
    df2out.drop('Overlap', axis=1, inplace=True)
    del df2_test_out
    new = [False]  # 0 for old, 1 for new
    df2out.insert(loc=3, column='New', value=new)
    # Find non-overlapping segments for phase 2.
    for i in range(1, len(df2.index)):
        # Append a new (ith) row to the output
        newrow = pd.DataFrame({'Start': [df2.loc[i][0]], 'End': [df2.loc[i][1]], 'cM': [df2.loc[i][2]], 'New': True})
        df2out = df2out.append(newrow.loc[0, :])
        # Overlap will be calculated between a segment and the previous one,
        # so rows need to be sorted based on start values.
        df2out = df2out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
        # Calculate the overlap between a segment and the one from the previous row
        df2out['Overlap'] = df2out['End'].shift(1) - df2out['Start']
        # Checks overlap for all rows in case the new row has too much overlap.
        # Maybe in the future change it to check only the new row and those before/after it, which would be faster
        if (df2out['Overlap'] > 1000000).any():
            # Drop new seg. if any rows have overlap
            df2out.drop(df2out[df2out['New'] == True].index, inplace=True)
        # Need to change any New = 1 to New = 0 before the next iteration in case a new row remains in the dataframe
        df2out.loc[:, 'New'] = 0
    del df2
    # End of 'for loop' to find non-overlapping segs. for a given chromosome
    df1out.insert(loc=0, column='Chr.', value=chro_i)
    df1out.loc[:, 'Chr.'] = chro_i
    df2out.drop(['New', 'Overlap'], axis=1, inplace=True)
    df2out.insert(loc=0, column='Chr.', value=chro_i)
    df2out.loc[:, 'Chr.'] = chro_i
    df_out = df_out.append(df1out)
    del df1out
    df_out = df_out.append(df2out)
    del df2out
# 'For loop' ends here for chrs. 1-22

df_out = df_out.merge(df_map, how='left', on=['Chr.', 'Start', 'End'], copy=False)
cols = list(df_out.columns)
a, b, c, d, e = cols.index('Chr.'), cols.index('Start'), cols.index('End'), cols.index('Name'), cols.index('cM')
cols[a], cols[b], cols[c], cols[e], cols[d] = cols[a], cols[b], cols[c], cols[d], cols[e]
df_out = df_out[cols]
print(df_out)
# Need to drop index first or while printing
df_out.to_csv('Your MyHeritage Segment-Level Matching File Name.csv', index=False)
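If you just want the core idea of Phase 1 without the DataFrame bookkeeping, the sketch below shows roughly the greedy rule the script applies to each chromosome: take candidate segments in descending centimorgan order and keep one only if it overlaps every segment already kept by no more than about 1,000,000 base pairs. This is a simplified illustration, not a line-for-line translation of the code above; the function name, the tuple format, and the example numbers are mine.

# Simplified sketch of the Phase 1 idea (illustrative only, not part of the script above).
# 'segments' is a list of (start, end, cM) tuples for one chromosome.
def pick_largest_nonoverlapping(segments, max_overlap=1000000):
    kept = []
    # Consider the largest-cM segments first
    for start, end, cm in sorted(segments, key=lambda seg: seg[2], reverse=True):
        # Keep a segment only if it overlaps each kept segment by at most max_overlap base pairs
        if all(min(end, k_end) - max(start, k_start) <= max_overlap
               for k_start, k_end, _ in kept):
            kept.append((start, end, cm))
    return sorted(kept)

# Example with made-up positions (base pairs) and sizes (cM):
# pick_largest_nonoverlapping([(1000000, 30000000, 45.0),
#                              (2000000, 25000000, 40.0),
#                              (29500000, 60000000, 38.0)])
# keeps the 45.0 cM and 38.0 cM segments and rejects the 40.0 cM one,
# because it overlaps the 45.0 cM segment by far more than 1 Mb.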
Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that's based on the first of my three genetic models. It lets you find how much of an ancestor's DNA you have when your DNA is combined with that of various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.
Sorry, but I’m confused. How do I use this?
Hi Julie,
This would be for a very small subset of genealogists. One would have to be able to use Python to run it. The result is probably similar to what you can get from DNA Painter.
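For anyone who does want to try it, the rough workflow would be: save the code above as a Python file, install pandas, export your segment-level matching file from MyHeritage, and set DATA_FILE at the top of the script to that file's name before running it. Something like this, where the file names are placeholders rather than anything from the post:

# Hypothetical setup -- the names below are placeholders:
DATA_FILE = 'myheritage_segment_matches.csv'  # your exported MyHeritage segment file
# Then run the saved script with Python 3, e.g. from a terminal:
#   python largest_segments.py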