I’ve written two Python algorithms that I describe in this article. The algorithm below works for downloaded segment-level match files from GEDmatch. The other algorithm, found here, is for match files downloaded from MyHeritage.

# Copyright (c) 2020 Brit Nicholson.
# All rights reserved. This program is made available under
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at
# https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt

import csv
import pandas as pd

# Input file and the exact columns this program needs from it.
DATA_FILE = 'Your GEDmatch Segment-Level Matching File Name.csv'
fields = ['PrimaryKit', 'MatchedKit', 'chr', 'B37Start', 'B37End', 'Segment cM', 'SNPs', 'MatchedName', 'Matched Sex', 'MatchedEmail']

# Read in the data file ('latin-1' tolerates non-UTF-8 bytes in names/emails)
df_file = pd.read_csv(DATA_FILE, delimiter=',', usecols=fields, encoding='latin-1')

# Drop rows with missing values, then keep only segments matching on at least 200 SNPs
df_file.dropna(inplace=True)
df_file = df_file[df_file['SNPs'] >= 200]
# Compare chromosomes as strings so '1'-'22' and 'X' are handled uniformly
df_file['chr'] = df_file['chr'].astype(str)

# Working copy for the segment algorithm, with duplicate rows removed.
# (The SNP filter — at least 200 SNPs — was already applied above.)
df_raw = df_file.copy().drop_duplicates()

# Only chromosome, positions, and cM are needed by the algorithm itself;
# match name and the other details are merged back in at the end.
df_raw = df_raw.drop(columns=['PrimaryKit', 'MatchedKit', 'SNPs', 'MatchedName', 'Matched Sex', 'MatchedEmail'])
df_raw.columns = ['Chr.', 'Start', 'End', 'cM']  # Rename columns

# Lookup table used at the very end to merge the match details back onto each output segment
df_map = df_file.copy().drop_duplicates()

del df_file

df_map = df_map.drop(columns=['PrimaryKit', 'Segment cM', 'Matched Sex'])
df_map.columns = ['Kit', 'Chr.', 'Start', 'End', 'SNPs', 'Name', 'Email']

# Accumulator for the results that will be written to the output file.
# Its shape is the same whether Phase 2 is run or only Phase 1.
df_out = pd.DataFrame(columns=['Chr.', 'Start', 'End', 'cM'])

# Chromosomes 1-22 plus X (the 'chr' column was cast to str earlier, so compare as strings)
chr_list = list(range(1, 23)) + ['X']
# Loop through chromosomes 1-22 and X
for chro_i in chr_list:
  chro_i_str = str(chro_i)
  df = df_raw[df_raw['Chr.'] == chro_i_str].copy().drop(['Chr.'], axis=1)

  # Need to sort by Start, then by cM, otherwise a smaller segment could get saved even if it's
  # followed by a larger segment w/ the same start value (because the first segment has Overlap = NaN)
  df = df.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

  # Running maximum of End so far; with Start sorted ascending, a row whose End is
  # below an earlier End lies fully inside that earlier segment.
  df['MaxEnd'] = df['End'].cummax()

  # Drop segments that are fully contained w/in a previous segment.
  df.drop(df[df['MaxEnd'] > df['End']].index, inplace=True)

  df.drop('MaxEnd', axis=1, inplace=True)

  # Phase 1 works on a copy sorted largest-cM first
  df1 = df.copy()

  df1 = df1.sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)

  # Seed the Phase 1 output with the largest-cM segment.
  # Label-based access (df1.loc[0, 'Start']) replaces the deprecated chained
  # positional indexing df1.loc[0][0], which pandas now warns about.
  df1out = pd.DataFrame({'Start': [df1.loc[0, 'Start']],
                         'End': [df1.loc[0, 'End']],
                         'cM': [df1.loc[0, 'cM']]})
  df1out = df1out.astype({"Start": 'int', "End": 'int'})

  # Cosmetic column-order fixup (all later access is by name, so this only affects
  # display order; on old pandas, dict-built frames came out alphabetically ordered)
  cols = list(df1out.columns)
  a, b, c = cols.index('End'), cols.index('Start'), cols.index('cM')
  cols[b], cols[a], cols[c] = cols[a], cols[b], cols[c]
  df1out = df1out[cols]

  new = [False]  # False for old, True for new
  # 'New' marks the candidate row just appended; only new rows may be dropped from the output.
  df1out.insert(loc=3, column='New', value=new)

  # Find non-overlapping segments for Phase 1. (The same scan is repeated for Phase 2 below;
  # it could be factored into a function.)
  for i in range(1, len(df1.index)):

    # Append the ith candidate row to the output.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    newrow = pd.DataFrame({'Start': [df1.loc[i, 'Start']], 'End': [df1.loc[i, 'End']],
                           'cM': [df1.loc[i, 'cM']], 'New': True})
    df1out = pd.concat([df1out, newrow], ignore_index=True)

    # Overlap is calculated between a segment and the previous one, so rows need to be
    # sorted by start values first.
    df1out = df1out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

    # Overlap (in base pairs) between each segment and the one from the previous row
    df1out['Overlap'] = df1out['End'].shift(1) - df1out['Start']

    # Checks overlap for all rows in case the new row has too much overlap (> 1 Mbp).
    # Maybe in the future change it to check only the new row and those before/after it,
    # which would be faster.
    if (df1out['Overlap'] > 1000000).any():

      # Drop new seg. if any rows have overlap
      df1out.drop(df1out[df1out['New'] == True].index, inplace=True)

    # Reset the flag so a surviving new row is treated as old on the next iteration
    df1out.loc[:, 'New'] = 0
  # End of Phase 1 scan for this chromosome

  del df1

  df1out.drop(['New', 'Overlap'], axis=1, inplace=True)
  # Need to keep df1out for Phase 2

  #################
  # Begin Phase 2 #
  #################

  # Keep only rows that didn't make it into the Phase 1 output. This concat/drop_duplicates
  # trick only works because duplicates have already been removed from both frames (which is true):
  df2 = pd.concat([df, df1out]).drop_duplicates(keep=False).sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)

  del df
  # Most of the rest of this code tries to find a large-cM row that overlaps a
  # Phase 1 output row by ~50%

  # Keep only a handful of Phase 1 segments, for better coverage and to make it likely
  # that the Phase 2 output is from the other homologue.
  df1_test = df1out.copy().sort_values(['cM', 'Start'], ascending=[False, True]).reset_index(drop=True)
  df1_test = df1_test.astype({"Start": 'int', "End": 'int'})

  # Use the 6 largest segs. from Phase 1 to find a good overlapping seg.
  # (was iloc[0:5], which only took 5 — off-by-one fixed to match the stated intent;
  # may include more in the future for a higher likelihood of finding one on the
  # opposite homologue)
  df1_test = df1_test.iloc[0:6]

  # Ensure decent coverage from df2_test
  df1_start = df1_test['Start'].min()
  df1_end = df1_test['End'].max()

  # Could change this 7, maybe to the average number of segments (minus one) in the
  # output of Phase 1:
  bin_num = 7  # Makes n + 1 bins

  bin_start = list(range(0, df1_end - int((df1_end - df1_start)/bin_num), int((df1_end - df1_start)/bin_num)))

  bin_end = list(range(int((df1_end - df1_start)/bin_num), df1_end, int((df1_end - df1_start)/bin_num)))

  df2_test = pd.DataFrame({'Start': bin_start, 'End': bin_end})
  # Cosmetic column-order fixup, same reasoning as for df1out above
  cols = list(df2_test.columns)
  a, b = cols.index('End'), cols.index('Start')
  cols[b], cols[a] = cols[a], cols[b]
  df2_test = df2_test[cols]
  df2_test['cM'] = 0

  df2_test = pd.concat([df2, df2_test]).sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)
  # df2_test becomes really large here, but the next line only keeps the largest-cM rows
  # in between the 0-cM bin markers made above

  # Resulting segs. are large w/ good coverage for comparing to df1_test. Have confirmed
  # that some can overlap.
  df2_test.drop(df2_test[(df2_test['cM'] == 0) | (df2_test['cM'] <= df2_test['cM'].shift(1))].index, inplace=True)

  # Add back the 20 largest remaining segments in case the binned ones above don't
  # have good overlap
  df2_test = pd.concat([df2.iloc[0:20], df2_test]).drop_duplicates().reset_index(drop=True)

  df2_test_out = pd.DataFrame(columns=['Start', 'End', 'cM', 'Overlap'])
  for i in range(0, len(df2_test.index)):
    for j in range(0, len(df1_test.index)):
      # Ensure the two segments actually overlap:
      if (df2_test.loc[i]['Start'] < df1_test.loc[j]['End']) & (df1_test.loc[j]['Start'] < df2_test.loc[i]['End']):

        overlap = df2_test.loc[i]['End'] - df1_test.loc[j]['Start']

        # NOTE(review): given the guard above, overlap should always be positive here,
        # so this branch looks unreachable — confirm before removing.
        if overlap < 0:
          overlap = df1_test.loc[j]['End'] - df2_test.loc[i]['Start']

        # Get the abs. fraction of overlap
        overlap = abs(df2_test.loc[i]['End'] - df2_test.loc[i]['Start'])/overlap

        if (0.25 < overlap) & (overlap < 0.75):

          newrow = pd.DataFrame({'Start': [df2_test.loc[i]['Start']], 'End': [df2_test.loc[i]['End']], 'cM': [df2_test.loc[i]['cM']], 'Overlap': overlap})

          df2_test_out = pd.concat([df2_test_out, newrow], ignore_index=True)

          # Break the inner (j) loop, since df1_test only holds non-overlapping segs.
          break

  del df1_test
  del df2_test

  # Distance of the overlap fraction from the ideal 50%
  df2_test_out['Overlap'] = abs(0.5 - df2_test_out['Overlap'])

  # Score: heavily favor large cM, lightly favor ~50% overlap
  df2_test_out['Overlap'] = (df2_test_out['cM']**3)/(df2_test_out['Overlap'])

  # Seed the Phase 2 output with the best-scoring row
  df2out = df2_test_out[df2_test_out['Overlap'] == df2_test_out['Overlap'].max()].copy()
  df2out.drop('Overlap', axis=1, inplace=True)

  del df2_test_out

  new = [False]  # False for old, True for new
  df2out.insert(loc=3, column='New', value=new)

  # Find non-overlapping segments for Phase 2 (same scan as Phase 1).
  # NOTE(review): this starts at i=1, so df2.loc[0] (the largest remaining segment) is
  # never considered unless it happens to be the seed — confirm whether that is intentional.
  for i in range(1, len(df2.index)):

    # Append the ith candidate row to the output (pd.concat: append was removed in pandas 2.0)
    newrow = pd.DataFrame({'Start': [df2.loc[i, 'Start']], 'End': [df2.loc[i, 'End']],
                           'cM': [df2.loc[i, 'cM']], 'New': True})
    df2out = pd.concat([df2out, newrow], ignore_index=True)

    # Overlap is calculated between a segment and the previous one, so rows need to be
    # sorted by start values first.
    df2out = df2out.sort_values(['Start', 'cM'], ascending=[True, False]).reset_index(drop=True)

    # Overlap (in base pairs) between each segment and the one from the previous row
    df2out['Overlap'] = df2out['End'].shift(1) - df2out['Start']

    # Checks overlap for all rows in case the new row has too much overlap (> 1 Mbp).
    if (df2out['Overlap'] > 1000000).any():

      # Drop new seg. if any rows have overlap
      df2out.drop(df2out[df2out['New'] == True].index, inplace=True)

    # Reset the flag so a surviving new row is treated as old on the next iteration
    df2out.loc[:, 'New'] = 0

  del df2
  # End of Phase 2 scan for this chromosome

  # Tag both phase outputs with the chromosome and add them to the overall output
  df1out.insert(loc=0, column='Chr.', value=chro_i_str)
  df1out.loc[:, 'Chr.'] = chro_i_str

  df2out.drop(['New', 'Overlap'], axis=1, inplace=True)
  df2out.insert(loc=0, column='Chr.', value=chro_i_str)
  df2out.loc[:, 'Chr.'] = chro_i_str

  df_out = pd.concat([df_out, df1out], ignore_index=True)
  del df1out
  df_out = pd.concat([df_out, df2out], ignore_index=True)
  del df2out

# End of the per-chromosome loop (chromosomes 1-22 and X)

# Cast the coordinates back to plain ints for output
df_out = df_out.astype({"Start": 'int', "End": 'int'})

# Merge the match details (kit, SNPs, name, email) back onto each output segment
df_out = df_out.merge(df_map, how='left', on=['Chr.', 'Start', 'End'], copy=False)

# Put the columns into the final report order (same order the original index
# rotation produced, stated explicitly by name)
df_out = df_out[['Chr.', 'Start', 'End', 'Name', 'Kit', 'cM', 'SNPs', 'Email']]
print(df_out)

# index=False keeps the DataFrame index out of the CSV
df_out.to_csv('Your GEDmatch Distinct Segment Output File Name.csv', index=False)

Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA that you and various relatives have in combination. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.